From 7cda2e3ce0d180688250856566b6c75ca07d7711 Mon Sep 17 00:00:00 2001 From: Alexander Timofeev Date: Sun, 21 Apr 2024 15:52:04 +0200 Subject: [PATCH] [AMDGPU] Change CF intrinsics lowering to reconverge on predecessors. Change-Id: I8609c5abae7cd9307ffc4f6ace5011be860998e8 --- .../CodeGenCUDA/atomics-remarks-gfx90a.cu | 2 +- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 4 +- .../AMDGPU/AMDGPUInstructionSelector.cpp | 9 +- .../Target/AMDGPU/AMDGPUInstructionSelector.h | 2 +- .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 19 +- .../Target/AMDGPU/SIAnnotateControlFlow.cpp | 92 +- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 34 +- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 43 +- llvm/lib/Target/AMDGPU/SIInstructions.td | 12 +- llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp | 544 ++-- .../Target/AMDGPU/SIOptimizeExecMasking.cpp | 4 +- .../Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp | 2 +- llvm/test/%t | 1 + .../AMDGPU/MIR/hidden-diverge-gmir.mir | 4 +- .../AMDGPU/MIR/temporal-divergence.mir | 21 - .../AMDGPU/MIR/uses-value-from-cycle.mir | 4 +- .../AMDGPU/deprecated/hidden-diverge.mir | 4 +- .../atomic_optimizations_mul_one.ll | 2 +- ...-divergent-i1-phis-no-lane-mask-merging.ll | 26 +- ...divergent-i1-phis-no-lane-mask-merging.mir | 14 +- ...vergence-divergent-i1-used-outside-loop.ll | 179 +- ...ergence-divergent-i1-used-outside-loop.mir | 98 +- .../GlobalISel/divergence-structurizer.ll | 160 +- .../GlobalISel/divergence-structurizer.mir | 116 +- .../divergence-temporal-divergent-i1.ll | 29 +- .../divergence-temporal-divergent-i1.mir | 24 +- .../divergence-temporal-divergent-reg.ll | 7 +- .../divergence-temporal-divergent-reg.mir | 2 - .../GlobalISel/divergent-control-flow.ll | 38 +- .../AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll | 144 +- .../global-atomic-fadd.f32-no-rtn.ll | 4 +- .../GlobalISel/global-atomic-fadd.f32-rtn.ll | 4 +- .../GlobalISel/image-waterfall-loop-O0.ll | 76 +- .../GlobalISel/irtranslator-atomicrmw.ll | 2 - .../GlobalISel/irtranslator-function-args.ll | 6 +- .../AMDGPU/GlobalISel/is-safe-to-sink-bug.ll | 15 +- .../GlobalISel/llvm.amdgcn.end.cf.i32.ll | 12 +- .../GlobalISel/llvm.amdgcn.end.cf.i64.ll | 11 +- .../GlobalISel/llvm.amdgcn.intersect_ray.ll | 140 +- .../llvm.amdgcn.make.buffer.rsrc.ll | 7 +- .../llvm.amdgcn.raw.buffer.atomic.add.ll | 28 +- .../llvm.amdgcn.raw.buffer.atomic.cmpswap.ll | 56 +- .../llvm.amdgcn.raw.buffer.atomic.fadd.ll | 28 +- .../llvm.amdgcn.raw.buffer.load.format.f16.ll | 21 +- .../llvm.amdgcn.raw.buffer.load.format.ll | 14 +- .../GlobalISel/llvm.amdgcn.raw.buffer.load.ll | 84 +- ...llvm.amdgcn.raw.buffer.store.format.f16.ll | 42 +- ...llvm.amdgcn.raw.buffer.store.format.f32.ll | 28 +- .../llvm.amdgcn.raw.buffer.store.ll | 84 +- .../llvm.amdgcn.raw.ptr.buffer.atomic.add.ll | 14 +- ...vm.amdgcn.raw.ptr.buffer.atomic.cmpswap.ll | 28 +- .../llvm.amdgcn.raw.ptr.buffer.atomic.fadd.ll | 28 +- ...m.amdgcn.raw.ptr.buffer.load.format.f16.ll | 14 +- .../llvm.amdgcn.raw.ptr.buffer.load.format.ll | 7 +- .../llvm.amdgcn.raw.ptr.buffer.load.ll | 42 +- ....amdgcn.raw.ptr.buffer.store.format.f16.ll | 28 +- ....amdgcn.raw.ptr.buffer.store.format.f32.ll | 14 +- .../llvm.amdgcn.raw.ptr.buffer.store.ll | 42 +- .../llvm.amdgcn.raw.ptr.tbuffer.load.f16.ll | 14 +- .../llvm.amdgcn.raw.ptr.tbuffer.load.ll | 7 +- .../llvm.amdgcn.raw.ptr.tbuffer.store.f16.ll | 42 +- .../llvm.amdgcn.raw.ptr.tbuffer.store.i8.ll | 42 +- .../llvm.amdgcn.raw.ptr.tbuffer.store.ll | 35 +- .../llvm.amdgcn.raw.tbuffer.load.f16.ll | 21 +- .../llvm.amdgcn.raw.tbuffer.load.ll | 14 +- .../llvm.amdgcn.raw.tbuffer.store.f16.ll | 63 +- .../llvm.amdgcn.raw.tbuffer.store.i8.ll | 63 +- .../llvm.amdgcn.raw.tbuffer.store.ll | 70 +- .../GlobalISel/llvm.amdgcn.s.buffer.load.ll | 336 +-- .../llvm.amdgcn.struct.buffer.atomic.add.ll | 28 +- ...lvm.amdgcn.struct.buffer.atomic.cmpswap.ll | 56 +- .../llvm.amdgcn.struct.buffer.atomic.fadd.ll | 28 +- ...vm.amdgcn.struct.buffer.load.format.f16.ll | 21 +- .../llvm.amdgcn.struct.buffer.load.format.ll | 14 +- .../llvm.amdgcn.struct.buffer.load.ll | 14 +- ...m.amdgcn.struct.buffer.store.format.f16.ll | 21 +- ...m.amdgcn.struct.buffer.store.format.f32.ll | 14 +- .../llvm.amdgcn.struct.buffer.store.ll | 14 +- ...lvm.amdgcn.struct.ptr.buffer.atomic.add.ll | 14 +- ...amdgcn.struct.ptr.buffer.atomic.cmpswap.ll | 28 +- ...vm.amdgcn.struct.ptr.buffer.atomic.fadd.ll | 28 +- ...mdgcn.struct.ptr.buffer.load.format.f16.ll | 14 +- ...vm.amdgcn.struct.ptr.buffer.load.format.ll | 7 +- .../llvm.amdgcn.struct.ptr.buffer.load.ll | 7 +- ...dgcn.struct.ptr.buffer.store.format.f16.ll | 14 +- ...dgcn.struct.ptr.buffer.store.format.f32.ll | 7 +- .../llvm.amdgcn.struct.ptr.buffer.store.ll | 7 +- ...llvm.amdgcn.struct.ptr.tbuffer.load.f16.ll | 14 +- .../llvm.amdgcn.struct.ptr.tbuffer.load.ll | 7 +- .../llvm.amdgcn.struct.tbuffer.load.f16.ll | 21 +- .../llvm.amdgcn.struct.tbuffer.load.ll | 14 +- .../GlobalISel/llvm.amdgcn.wqm.demote.ll | 384 +-- .../CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll | 27 +- .../CodeGen/AMDGPU/GlobalISel/localizer.ll | 45 +- .../AMDGPU/GlobalISel/mul-known-bits.i64.ll | 35 +- .../AMDGPU/GlobalISel/non-entry-alloca.ll | 42 +- .../regbankselect-amdgcn-s-buffer-load.mir | 12 +- .../regbankselect-amdgcn.image.load.1d.ll | 24 +- .../regbankselect-amdgcn.image.sample.1d.ll | 36 +- .../regbankselect-amdgcn.raw.buffer.load.ll | 18 +- ...egbankselect-amdgcn.raw.ptr.buffer.load.ll | 18 +- .../regbankselect-amdgcn.s.buffer.load.ll | 120 +- ...regbankselect-amdgcn.struct.buffer.load.ll | 18 +- ...egbankselect-amdgcn.struct.buffer.store.ll | 18 +- ...ankselect-amdgcn.struct.ptr.buffer.load.ll | 18 +- ...nkselect-amdgcn.struct.ptr.buffer.store.ll | 18 +- .../regbankselect-waterfall-agpr.mir | 12 +- .../CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll | 519 ++-- .../CodeGen/AMDGPU/GlobalISel/srem.i64.ll | 443 +-- .../CodeGen/AMDGPU/GlobalISel/udiv.i64.ll | 153 +- .../CodeGen/AMDGPU/GlobalISel/urem.i64.ll | 145 +- llvm/test/CodeGen/AMDGPU/amdpal-callable.ll | 1 + .../AMDGPU/atomic-optimizer-strict-wqm.ll | 36 +- .../AMDGPU/atomic_optimizations_buffer.ll | 546 ++-- .../atomic_optimizations_global_pointer.ll | 1248 ++++---- .../atomic_optimizations_local_pointer.ll | 1317 +++++---- .../atomic_optimizations_pixelshader.ll | 365 ++- .../AMDGPU/atomic_optimizations_raw_buffer.ll | 468 +-- .../atomic_optimizations_struct_buffer.ll | 468 +-- llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll | 211 +- llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll | 21 +- .../AMDGPU/atomics-cas-remarks-gfx90a.ll | 1 + .../AMDGPU/bb-prolog-spill-during-regalloc.ll | 44 +- .../block-should-not-be-in-alive-blocks.mir | 27 +- .../CodeGen/AMDGPU/branch-condition-and.ll | 1 + .../branch-folding-implicit-def-subreg.ll | 785 ++--- ...anch-relaxation-gfx10-branch-offset-bug.ll | 1 + llvm/test/CodeGen/AMDGPU/branch-relaxation.ll | 85 +- .../AMDGPU/bug-sdag-emitcopyfromreg.ll | 71 +- llvm/test/CodeGen/AMDGPU/bypass-div.ll | 96 +- llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll | 1 + llvm/test/CodeGen/AMDGPU/call-skip.ll | 1 + .../AMDGPU/cgp-addressing-modes-flat.ll | 232 +- .../AMDGPU/cgp-addressing-modes-gfx1030.ll | 2 +- .../AMDGPU/cgp-addressing-modes-gfx908.ll | 8 +- .../CodeGen/AMDGPU/cgp-addressing-modes.ll | 1 + .../codegen-prepare-addrspacecast-non-null.ll | 58 +- llvm/test/CodeGen/AMDGPU/collapse-endcf.ll | 933 +++--- llvm/test/CodeGen/AMDGPU/collapse-endcf.mir | 501 ++-- .../AMDGPU/constant-fold-imm-immreg.mir | 2 +- .../AMDGPU/control-flow-fastregalloc.ll | 1 + .../CodeGen/AMDGPU/control-flow-optnone.ll | 1 + .../test/CodeGen/AMDGPU/convergence-tokens.ll | 1 + .../CodeGen/AMDGPU/convergent-inlineasm.ll | 1 + llvm/test/CodeGen/AMDGPU/cse-convergent.ll | 27 +- .../CodeGen/AMDGPU/cse-phi-incoming-val.ll | 1 + .../CodeGen/AMDGPU/dag-divergence-atomic.ll | 21 +- .../CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll | 2 +- .../dagcombine-v1i8-extractvecelt-crash.ll | 8 +- llvm/test/CodeGen/AMDGPU/div_i128.ll | 768 +++-- llvm/test/CodeGen/AMDGPU/div_v2i128.ll | 918 +++--- .../divergent-branch-uniform-condition.ll | 25 +- llvm/test/CodeGen/AMDGPU/dpp_combine.mir | 2 +- .../test/CodeGen/AMDGPU/dpp_combine_gfx11.mir | 2 +- .../early-tailduplicator-terminator.mir | 16 +- llvm/test/CodeGen/AMDGPU/else.ll | 1 + llvm/test/CodeGen/AMDGPU/endcf-loop-header.ll | 1 + .../AMDGPU/fix-frame-ptr-reg-copy-livein.ll | 1 + .../CodeGen/AMDGPU/flat-atomic-fadd.v2f16.ll | 4 - .../CodeGen/AMDGPU/flat_atomics_i32_system.ll | 1266 ++++---- .../CodeGen/AMDGPU/flat_atomics_i64_system.ll | 1266 ++++---- llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll | 37 +- llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll | 20 +- llvm/test/CodeGen/AMDGPU/fold-fabs.ll | 37 +- .../CodeGen/AMDGPU/fp64-atomics-gfx90a.ll | 144 +- llvm/test/CodeGen/AMDGPU/fptoi.i128.ll | 830 +++--- .../CodeGen/AMDGPU/frame-index-elimination.ll | 1 + llvm/test/CodeGen/AMDGPU/function-args.ll | 19 +- .../AMDGPU/global-atomic-fadd.f32-no-rtn.ll | 12 +- .../AMDGPU/global-atomic-fadd.f32-rtn.ll | 4 +- .../global-atomics-fp-wrong-subtarget.ll | 1 + llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll | 744 +++-- .../global-saddr-atomics-min-max-system.ll | 688 +++-- .../AMDGPU/global_atomics_i32_system.ll | 1266 ++++---- .../AMDGPU/global_atomics_i64_system.ll | 1266 ++++---- .../AMDGPU/global_atomics_scan_fadd.ll | 2570 ++++++++++------- .../AMDGPU/global_atomics_scan_fmax.ll | 1926 +++++++----- .../AMDGPU/global_atomics_scan_fmin.ll | 1926 +++++++----- .../AMDGPU/global_atomics_scan_fsub.ll | 2408 +++++++++------ llvm/test/CodeGen/AMDGPU/hoist-cond.ll | 1 + llvm/test/CodeGen/AMDGPU/hsa.ll | 1 - llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll | 28 +- llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll | 1 + .../i1_copy_phi_with_phi_incoming_value.mir | 27 +- .../identical-subrange-spill-infloop.ll | 534 ++-- .../CodeGen/AMDGPU/image-sample-waterfall.ll | 12 +- llvm/test/CodeGen/AMDGPU/indirect-call.ll | 318 +- llvm/test/CodeGen/AMDGPU/infinite-loop.ll | 1 + llvm/test/CodeGen/AMDGPU/inline-asm.ll | 1 + .../CodeGen/AMDGPU/insert-delay-alu-bug.ll | 30 +- .../insert_waitcnt_for_precise_memory.ll | 232 +- llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll | 114 +- llvm/test/CodeGen/AMDGPU/itofp.i128.ll | 724 +++-- .../test/CodeGen/AMDGPU/kill-infinite-loop.ll | 90 +- .../AMDGPU/lds-global-non-entry-func.ll | 176 +- ...alize-amdgcn.raw.buffer.load.format.f16.ll | 25 +- .../legalize-amdgcn.raw.buffer.load.format.ll | 30 +- .../AMDGPU/legalize-amdgcn.raw.buffer.load.ll | 175 +- ...lize-amdgcn.raw.buffer.store.format.f16.ll | 55 +- ...lize-amdgcn.raw.buffer.store.format.f32.ll | 60 +- .../legalize-amdgcn.raw.buffer.store.ll | 165 +- ...e-amdgcn.raw.ptr.buffer.load.format.f16.ll | 25 +- ...alize-amdgcn.raw.ptr.buffer.load.format.ll | 30 +- .../legalize-amdgcn.raw.ptr.buffer.load.ll | 175 +- ...-amdgcn.raw.ptr.buffer.store.format.f16.ll | 55 +- ...-amdgcn.raw.ptr.buffer.store.format.f32.ll | 60 +- .../legalize-amdgcn.raw.ptr.buffer.store.ll | 165 +- ...egalize-amdgcn.raw.ptr.tbuffer.load.f16.ll | 40 +- .../legalize-amdgcn.raw.ptr.tbuffer.load.ll | 45 +- ...galize-amdgcn.raw.ptr.tbuffer.store.f16.ll | 50 +- .../legalize-amdgcn.raw.ptr.tbuffer.store.ll | 125 +- .../legalize-amdgcn.raw.tbuffer.load.f16.ll | 40 +- .../legalize-amdgcn.raw.tbuffer.load.ll | 45 +- .../legalize-amdgcn.raw.tbuffer.store.f16.ll | 50 +- .../legalize-amdgcn.raw.tbuffer.store.ll | 125 +- .../CodeGen/AMDGPU/legalize-soffset-mbuf.ll | 160 +- .../CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll | 1 + .../AMDGPU/llvm.amdgcn.ds.ordered.swap.ll | 1 + llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll | 1 + .../AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll | 24 +- .../AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll | 24 +- .../CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll | 1 + .../CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll | 266 +- .../CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll | 266 +- .../CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll | 1 + .../CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll | 32 +- ....amdgcn.struct.buffer.load.format.v3f16.ll | 42 +- ...gcn.struct.ptr.buffer.load.format.v3f16.ll | 33 +- .../CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll | 372 +-- llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll | 812 +++--- .../AMDGPU/long-branch-reserve-register.ll | 20 +- .../loop-live-out-copy-undef-subrange.ll | 7 +- .../AMDGPU/loop-on-function-argument.ll | 8 +- llvm/test/CodeGen/AMDGPU/loop_break.ll | 42 +- .../test/CodeGen/AMDGPU/loop_exit_with_xor.ll | 38 +- .../lower-control-flow-live-intervals.mir | 106 +- ...wer-control-flow-live-variables-update.mir | 197 +- ...ntrol-flow-live-variables-update.xfail.mir | 3 +- .../lower-control-flow-other-terminators.mir | 72 +- .../AMDGPU/lower-i1-copies-clear-kills.mir | 12 +- .../machine-sink-ignorable-exec-use.mir | 29 +- .../CodeGen/AMDGPU/machine-sink-lane-mask.mir | 8 +- ...p-var-out-of-divergent-loop-swdev407790.ll | 40 +- ...-var-out-of-divergent-loop-swdev407790.mir | 8 +- ...ne-sink-temporal-divergence-swdev407790.ll | 407 +-- ...e-sink-temporal-divergence-swdev407790.mir | 4 - llvm/test/CodeGen/AMDGPU/madmk.ll | 1 + .../CodeGen/AMDGPU/memcpy-crash-issue63986.ll | 57 +- .../CodeGen/AMDGPU/mixed-wave32-wave64.ll | 1 + llvm/test/CodeGen/AMDGPU/mmra.ll | 8 +- .../AMDGPU/move-to-valu-atomicrmw-system.ll | 42 +- .../CodeGen/AMDGPU/move-to-valu-atomicrmw.ll | 10 +- .../AMDGPU/move-to-valu-vimage-vsample.ll | 30 +- ...uf-legalize-operands-non-ptr-intrinsics.ll | 567 ++-- .../CodeGen/AMDGPU/mubuf-legalize-operands.ll | 565 ++-- .../AMDGPU/mubuf-legalize-operands.mir | 40 +- .../CodeGen/AMDGPU/mul24-pass-ordering.ll | 24 +- .../AMDGPU/multi-divergent-exit-region.ll | 1 + llvm/test/CodeGen/AMDGPU/multilevel-break.ll | 6 +- .../CodeGen/AMDGPU/nested-loop-conditions.ll | 3 +- .../CodeGen/AMDGPU/no-dup-inst-prefetch.ll | 36 +- llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll | 92 +- .../CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir | 12 +- .../AMDGPU/pal-metadata-3.0-callable.ll | 1 + .../CodeGen/AMDGPU/phi-elimination-end-cf.mir | 8 +- llvm/test/CodeGen/AMDGPU/rem_i128.ll | 330 ++- ...emove-incompatible-extended-image-insts.ll | 5 +- llvm/test/CodeGen/AMDGPU/ret_jump.ll | 1 + ...calc-one-successor-two-predecessors-bug.ll | 20 +- llvm/test/CodeGen/AMDGPU/sdiv64.ll | 184 +- .../AMDGPU/set-inactive-wwm-overwrite.ll | 48 +- llvm/test/CodeGen/AMDGPU/setcc-sext.ll | 1 + llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll | 45 +- .../AMDGPU/should-not-hoist-set-inactive.ll | 46 +- .../CodeGen/AMDGPU/si-annotate-cf-kill.ll | 36 +- .../CodeGen/AMDGPU/si-annotate-cf-noloop.ll | 1 + .../AMDGPU/si-annotate-cf-unreachable.ll | 1 + llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll | 48 +- .../CodeGen/AMDGPU/si-annotate-dbg-info.ll | 12 +- .../si-annotate-nested-control-flows.ll | 1 + .../si-annotatecfg-multiple-backedges.ll | 9 +- .../CodeGen/AMDGPU/si-fix-sgpr-copies.mir | 11 +- .../AMDGPU/si-lower-control-flow-kill.ll | 1 + ...si-lower-control-flow-unreachable-block.ll | 1 + .../CodeGen/AMDGPU/si-lower-control-flow.mir | 130 +- ...lower-i1-copies-order-of-phi-incomings.mir | 4 +- .../CodeGen/AMDGPU/si-lower-i1-copies.mir | 3 +- .../si-opt-vgpr-liverange-bug-deadlanes.mir | 4 +- .../si-optimize-vgpr-live-range-dbg-instr.ll | 30 +- .../si-optimize-vgpr-live-range-dbg-instr.mir | 3 +- .../si-unify-exit-multiple-unreachables.ll | 78 +- .../si-unify-exit-return-unreachable.ll | 13 +- llvm/test/CodeGen/AMDGPU/sibling-call.ll | 2 +- llvm/test/CodeGen/AMDGPU/skip-branch-trap.ll | 1 + llvm/test/CodeGen/AMDGPU/skip-if-dead.ll | 331 ++- .../test/CodeGen/AMDGPU/spill-cfg-position.ll | 1 + .../CodeGen/AMDGPU/spill-scavenge-offset.ll | 130 +- llvm/test/CodeGen/AMDGPU/srem64.ll | 184 +- ...tack-pointer-offset-relative-frameindex.ll | 24 +- .../CodeGen/AMDGPU/stacksave_stackrestore.ll | 103 +- .../AMDGPU/stale-livevar-in-twoaddr-pass.mir | 2 +- .../stop-tail-duplicate-cfg-intrinsic.mir | 4 +- .../AMDGPU/subreg-coalescer-undef-use.ll | 7 +- .../transform-block-with-return-to-epilog.ll | 72 +- .../AMDGPU/tuple-allocation-failure.ll | 208 +- llvm/test/CodeGen/AMDGPU/udiv64.ll | 194 +- llvm/test/CodeGen/AMDGPU/uniform-cfg.ll | 40 +- .../AMDGPU/uniform-loop-inside-nonuniform.ll | 1 + .../CodeGen/AMDGPU/uniform-phi-with-undef.ll | 11 +- .../AMDGPU/unstructured-cfg-def-use-issue.ll | 121 +- llvm/test/CodeGen/AMDGPU/urem64.ll | 143 +- llvm/test/CodeGen/AMDGPU/valu-i1.ll | 1 + .../CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll | 1 + ...r-descriptor-waterfall-loop-idom-update.ll | 18 +- llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll | 81 +- llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll | 211 +- .../AMDGPU/vgpr-mark-last-scratch-load.ll | 19 +- .../AMDGPU/vgpr-spill-placement-issue61083.ll | 29 +- .../test/CodeGen/AMDGPU/vni8-across-blocks.ll | 1570 +++++----- .../CodeGen/AMDGPU/waterfall_kills_scc.ll | 32 +- llvm/test/CodeGen/AMDGPU/wave32.ll | 230 +- llvm/test/CodeGen/AMDGPU/while-break.ll | 112 +- llvm/test/CodeGen/AMDGPU/wqm.ll | 494 ++-- .../test/CodeGen/AMDGPU/wwm-reserved-spill.ll | 99 +- llvm/test/CodeGen/AMDGPU/wwm-reserved.ll | 150 +- ...dgpu_generated_funcs.ll.generated.expected | 23 +- ...pu_generated_funcs.ll.nogenerated.expected | 23 +- 327 files changed, 24983 insertions(+), 20612 deletions(-) create mode 100644 llvm/test/%t diff --git a/clang/test/CodeGenCUDA/atomics-remarks-gfx90a.cu b/clang/test/CodeGenCUDA/atomics-remarks-gfx90a.cu index 946927d88a1ee..3ca766755a631 100644 --- a/clang/test/CodeGenCUDA/atomics-remarks-gfx90a.cu +++ b/clang/test/CodeGenCUDA/atomics-remarks-gfx90a.cu @@ -10,7 +10,7 @@ // GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at system memory scope // GFX90A-CAS-LABEL: _Z14atomic_add_casPf // GFX90A-CAS: flat_atomic_cmpswap -// GFX90A-CAS: s_cbranch_execnz +// GFX90A-CAS: s_cbranch_scc1 __device__ float atomic_add_cas(float *p) { return __atomic_fetch_add(p, 1.0f, memory_order_relaxed); } diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index be8048ca2459c..75ad7ed5e3fa2 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -3172,8 +3172,8 @@ def int_amdgcn_loop : Intrinsic<[llvm_i1_ty], [llvm_anyint_ty], [IntrWillReturn, IntrNoCallback, IntrNoFree] >; -def int_amdgcn_end_cf : Intrinsic<[], [llvm_anyint_ty], - [IntrWillReturn, IntrNoCallback, IntrNoFree]>; +def int_amdgcn_wave_reconverge : Intrinsic<[], [llvm_anyint_ty], + [IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; // Represent unreachable in a divergent region. def int_amdgcn_unreachable : Intrinsic<[], [], [IntrConvergent, IntrNoCallback, IntrNoFree]>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index b48a09489653a..9374933986080 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1553,11 +1553,12 @@ bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const { return true; } -bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const { +bool AMDGPUInstructionSelector::selectWaveReconvergeIntrinsic( + MachineInstr &MI) const { // FIXME: Manually selecting to avoid dealing with the SReg_1 trick // SelectionDAG uses for wave32 vs wave64. MachineBasicBlock *BB = MI.getParent(); - BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF)) + BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_WAVE_RECONVERGE)) .add(MI.getOperand(1)); Register Reg = MI.getOperand(1).getReg(); @@ -2083,8 +2084,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( MachineInstr &I) const { unsigned IntrinsicID = cast(I).getIntrinsicID(); switch (IntrinsicID) { - case Intrinsic::amdgcn_end_cf: - return selectEndCfIntrinsic(I); + case Intrinsic::amdgcn_wave_reconverge: + return selectWaveReconvergeIntrinsic(I); case Intrinsic::amdgcn_ds_ordered_add: case Intrinsic::amdgcn_ds_ordered_swap: return selectDSOrderedIntrinsic(I, IntrinsicID); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index f561d5d29efc4..44c89684893f7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -119,7 +119,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector { bool selectReturnAddress(MachineInstr &I) const; bool selectG_INTRINSIC(MachineInstr &I) const; - bool selectEndCfIntrinsic(MachineInstr &MI) const; + bool selectWaveReconvergeIntrinsic(MachineInstr &MI) const; bool selectDSOrderedIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const; bool selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const; bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 56345d14a331c..368cc98b9a585 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -785,8 +785,6 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass(); const unsigned MovExecOpc = Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - const unsigned MovExecTermOpc = - Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term; const unsigned XorTermOpc = Subtarget.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term; @@ -949,9 +947,11 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( B.setInsertPt(*BodyBB, BodyBB->end()); + Register LoopMask = MRI.createVirtualRegister( + TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID)); // Update EXEC, switch all done bits to 0 and all todo bits to 1. B.buildInstr(XorTermOpc) - .addDef(ExecReg) + .addDef(LoopMask) .addReg(ExecReg) .addReg(NewExec); @@ -959,18 +959,15 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( // s_cbranch_scc0? // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover. - B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB); + B.buildInstr(AMDGPU::SI_WATERFALL_LOOP) + .addReg(LoopMask) + .addReg(NewExec) + .addMBB(LoopBB); // Save the EXEC mask before the loop. BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg) .addReg(ExecReg); - // Restore the EXEC mask after the loop. - B.setMBB(*RestoreExecBB); - B.buildInstr(MovExecTermOpc) - .addDef(ExecReg) - .addReg(SaveExecReg); - // Set the insert point after the original instruction, so any new // instructions will be in the remainder. B.setInsertPt(*RemainderBB, RemainderBB->begin()); @@ -4954,7 +4951,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32); break; } - case Intrinsic::amdgcn_end_cf: { + case Intrinsic::amdgcn_wave_reconverge: { unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); break; diff --git a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp index 08e1d6b87b0df..68d81a6ffaaff 100644 --- a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -15,6 +15,7 @@ #include "GCNSubtarget.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/UniformityAnalysis.h" +#include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" @@ -53,7 +54,7 @@ class SIAnnotateControlFlow : public FunctionPass { Function *Else; Function *IfBreak; Function *Loop; - Function *EndCf; + Function *WaveReconverge; DominatorTree *DT; StackVector Stack; @@ -86,7 +87,7 @@ class SIAnnotateControlFlow : public FunctionPass { bool handleLoop(BranchInst *Term); - bool closeControlFlow(BasicBlock *BB); + bool tryWaveReconverge(BasicBlock *BB); public: static char ID; @@ -141,7 +142,7 @@ void SIAnnotateControlFlow::initialize(Module &M, const GCNSubtarget &ST) { IfBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if_break, { IntMask }); Loop = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_loop, { IntMask }); - EndCf = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_end_cf, { IntMask }); + WaveReconverge = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_wave_reconverge, { IntMask }); } /// Is the branch condition uniform or did the StructurizeCFG pass @@ -203,8 +204,6 @@ bool SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) { /// Open a new "If" block bool SIAnnotateControlFlow::openIf(BranchInst *Term) { - if (isUniform(Term)) - return false; IRBuilder<> IRB(Term); Value *IfCall = IRB.CreateCall(If, {Term->getCondition()}); @@ -305,43 +304,43 @@ bool SIAnnotateControlFlow::handleLoop(BranchInst *Term) { } /// Close the last opened control flow -bool SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { - llvm::Loop *L = LI->getLoopFor(BB); +bool SIAnnotateControlFlow::tryWaveReconverge(BasicBlock *BB) { - assert(Stack.back().first == BB); + if (succ_empty(BB)) + return false; - if (L && L->getHeader() == BB) { - // We can't insert an EndCF call into a loop header, because it will - // get executed on every iteration of the loop, when it should be - // executed only once before the loop. - SmallVector Latches; - L->getLoopLatches(Latches); + BranchInst *Term = dyn_cast(BB->getTerminator()); + if (Term->getNumSuccessors() == 1) { + // The current BBs single successor is a top of the stack. We need to + // reconverge over thaqt path. + BasicBlock *SingleSucc = *succ_begin(BB); + BasicBlock::iterator InsPt = Term ? BasicBlock::iterator(Term) : BB->end(); - SmallVector Preds; - for (BasicBlock *Pred : predecessors(BB)) { - if (!is_contained(Latches, Pred)) - Preds.push_back(Pred); + if (isTopOfStack(SingleSucc)) { + Value *Exec = Stack.back().second; + IRBuilder<>(BB, InsPt).CreateCall(WaveReconverge, {Exec}); } - - BB = SplitBlockPredecessors(BB, Preds, "endcf.split", DT, LI, nullptr, - false); - } - - Value *Exec = popSaved(); - BasicBlock::iterator FirstInsertionPt = BB->getFirstInsertionPt(); - if (!isa(Exec) && !isa(FirstInsertionPt)) { - Instruction *ExecDef = cast(Exec); - BasicBlock *DefBB = ExecDef->getParent(); - if (!DT->dominates(DefBB, BB)) { - // Split edge to make Def dominate Use - FirstInsertionPt = SplitEdge(DefBB, BB, DT, LI)->getFirstInsertionPt(); + } else { + // We have a uniform conditional branch terminating the block. + // THis block may be the last in the Then path of the enclosing divergent + // IF. + if (!isUniform(Term)) + // Divergent loop is going to be further processed in another place + return false; + + for (auto Succ : Term->successors()) { + if (isTopOfStack(Succ)) { + // Just split to make a room for further WAVE_RECONVERGE insertion + SmallVector Preds; + for (auto P : predecessors(Succ)) { + if (DT->dominates(BB, P)) + Preds.push_back(P); + } + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); + SplitBlockPredecessors(Succ, Preds, ".reconverge", &DTU, LI, + nullptr, false); + } } - IRBuilder<> IRB(FirstInsertionPt->getParent(), FirstInsertionPt); - // TODO: StructurizeCFG 'Flow' blocks have debug locations from the - // condition, for now just avoid copying these DebugLocs so that stepping - // out of the then/else block in a debugger doesn't step to the condition. - IRB.SetCurrentDebugLocation(DebugLoc()); - IRB.CreateCall(EndCf, {Exec}); } return true; @@ -365,14 +364,20 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) { if (!Term || Term->isUnconditional()) { if (isTopOfStack(BB)) - Changed |= closeControlFlow(BB); + Stack.pop_back(); + + Changed |= tryWaveReconverge(BB); continue; } if (I.nodeVisited(Term->getSuccessor(1))) { if (isTopOfStack(BB)) - Changed |= closeControlFlow(BB); + Stack.pop_back(); + + // Let's take care of uniform loop latch that may be closing the Then + // path of the enclosing divergent branch. + Changed |= tryWaveReconverge(BB); if (DT->dominates(Term->getSuccessor(1), BB)) Changed |= handleLoop(Term); @@ -387,10 +392,15 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) { continue; } - Changed |= closeControlFlow(BB); + Stack.pop_back(); } - Changed |= openIf(Term); + if (isUniform(Term)) + // Uniform conditional branch may be in the block that closes the Then + // path of the divergent conditional branch. + Changed |= tryWaveReconverge(BB); + else + Changed |= openIf(Term); } if (!Stack.empty()) { diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index d7b6941fcf81d..ea1e7c782e02d 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -6299,7 +6299,7 @@ unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const { return AMDGPUISD::ELSE; case Intrinsic::amdgcn_loop: return AMDGPUISD::LOOP; - case Intrinsic::amdgcn_end_cf: + case Intrinsic::amdgcn_wave_reconverge: llvm_unreachable("should not occur"); default: return 0; @@ -9940,8 +9940,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, return SDValue(Load, 0); } - case Intrinsic::amdgcn_end_cf: - return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other, + case Intrinsic::amdgcn_wave_reconverge: + return SDValue(DAG.getMachineNode(AMDGPU::SI_WAVE_RECONVERGE, DL, MVT::Other, Op->getOperand(2), Chain), 0); case Intrinsic::amdgcn_s_barrier_init: case Intrinsic::amdgcn_s_barrier_join: @@ -15740,6 +15740,32 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const { } } + // ISel inserts copy to regs for the successor PHIs + // at the BB end. We need to move the SI_WAVE_RECONVERGE right before the + // branch. + for (auto &MBB : MF) { + for (auto &MI : MBB) { + if (MI.getOpcode() == AMDGPU::SI_WAVE_RECONVERGE) { + MachineBasicBlock::iterator I(MI); + MachineBasicBlock::iterator Next = std::next(I); + bool NeedToMove = false; + while (Next != MBB.end() && !Next->isBranch()) { + NeedToMove = true; + Next++; + } + + assert((Next == MBB.end() || !Next->readsRegister(AMDGPU::SCC, TRI)) && + "Malformed CFG detected!\n"); + + if (NeedToMove) { + MBB.splice(Next, &MBB, &MI); + } + + break; + } + } + } + // FIXME: This is a hack to fixup AGPR classes to use the properly aligned // classes if required. Ideally the register class constraints would differ // per-subtarget, but there's no easy way to achieve that right now. This is @@ -16336,7 +16362,7 @@ static bool hasCFUser(const Value *V, SmallPtrSet &Visited, default: Result = false; break; - case Intrinsic::amdgcn_end_cf: + case Intrinsic::amdgcn_wave_reconverge: case Intrinsic::amdgcn_loop: Result = true; break; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 08351c49b2231..3412846a5abd9 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2103,12 +2103,36 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MI.setDesc(get(AMDGPU::S_MOV_B64)); break; + case AMDGPU::S_CMOV_B64_term: + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(get(AMDGPU::S_CMOV_B64)); + break; + case AMDGPU::S_MOV_B32_term: // This is only a terminator to get the correct spill code placement during // register allocation. MI.setDesc(get(AMDGPU::S_MOV_B32)); break; + case AMDGPU::S_CMOV_B32_term: + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(get(AMDGPU::S_CMOV_B32)); + break; + + case AMDGPU::S_CSELECT_B32_term: + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(get(AMDGPU::S_CSELECT_B32)); + break; + + case AMDGPU::S_CSELECT_B64_term: + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(get(AMDGPU::S_CSELECT_B64)); + break; + case AMDGPU::S_XOR_B64_term: // This is only a terminator to get the correct spill code placement during // register allocation. @@ -3088,20 +3112,25 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, while (I != E && !I->isBranch() && !I->isReturn()) { switch (I->getOpcode()) { case AMDGPU::S_MOV_B64_term: + case AMDGPU::S_CMOV_B64_term: case AMDGPU::S_XOR_B64_term: case AMDGPU::S_OR_B64_term: case AMDGPU::S_ANDN2_B64_term: case AMDGPU::S_AND_B64_term: case AMDGPU::S_AND_SAVEEXEC_B64_term: + case AMDGPU::S_CSELECT_B64_term: case AMDGPU::S_MOV_B32_term: + case AMDGPU::S_CMOV_B32_term: case AMDGPU::S_XOR_B32_term: case AMDGPU::S_OR_B32_term: case AMDGPU::S_ANDN2_B32_term: case AMDGPU::S_AND_B32_term: case AMDGPU::S_AND_SAVEEXEC_B32_term: + case AMDGPU::S_CSELECT_B32_term: break; case AMDGPU::SI_IF: case AMDGPU::SI_ELSE: + case AMDGPU::SI_WAVE_RECONVERGE: case AMDGPU::SI_KILL_I1_TERMINATOR: case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: // FIXME: It's messy that these need to be considered here at all. @@ -6386,6 +6415,7 @@ static void emitLoadScalarOpsFromVGPRLoop( } Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); + Register LoopMask = MRI.createVirtualRegister(BoolXExecRC); MRI.setSimpleHint(SaveExec, CondReg); // Update EXEC to matching lanes, saving original to SaveExec. @@ -6396,11 +6426,14 @@ static void emitLoadScalarOpsFromVGPRLoop( I = BodyBB.end(); // Update EXEC, switch all done bits to 0 and all todo bits to 1. - BuildMI(BodyBB, I, DL, TII.get(XorTermOpc), Exec) + BuildMI(BodyBB, I, DL, TII.get(XorTermOpc), LoopMask) .addReg(Exec) .addReg(SaveExec); - BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB); + BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)) + .addReg(LoopMask) + .addReg(SaveExec) + .addMBB(&LoopBB); } // Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register @@ -6502,8 +6535,10 @@ loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, .addImm(0); } + // BuildMI(*BodyBB, BodyBB->end(), DL, TII.get(AMDGPU::S_BRANCH)) + // .addMBB(RemainderBB); // Restore the EXEC mask - BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec); + // BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec); return BodyBB; } @@ -8782,7 +8817,7 @@ void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry, .add(Branch->getOperand(0)) .add(Branch->getOperand(1)); MachineInstr *SIEND = - BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF)) + BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_WAVE_RECONVERGE)) .addReg(DstReg); IfEntry->erase(TI); diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index e7aeaa017306c..c526d5ad662eb 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -350,6 +350,8 @@ class WrapTerminatorInst : SPseudoInstSI< let WaveSizePredicate = isWave64 in { def S_MOV_B64_term : WrapTerminatorInst; +def S_CMOV_B64_term : WrapTerminatorInst; +def S_CSELECT_B64_term : WrapTerminatorInst; def S_XOR_B64_term : WrapTerminatorInst; def S_OR_B64_term : WrapTerminatorInst; def S_ANDN2_B64_term : WrapTerminatorInst; @@ -359,6 +361,8 @@ def S_AND_SAVEEXEC_B64_term : WrapTerminatorInst; let WaveSizePredicate = isWave32 in { def S_MOV_B32_term : WrapTerminatorInst; +def S_CMOV_B32_term : WrapTerminatorInst; +def S_CSELECT_B32_term : WrapTerminatorInst; def S_XOR_B32_term : WrapTerminatorInst; def S_OR_B32_term : WrapTerminatorInst; def S_ANDN2_B32_term : WrapTerminatorInst; @@ -460,7 +464,7 @@ def SI_ELSE : CFPseudoInstSI < def SI_WATERFALL_LOOP : CFPseudoInstSI < (outs), - (ins brtarget:$target), [], 1> { + (ins SReg_1:$LoopMask, SReg_1:$ExitMask, brtarget:$target), [], 1> { let Size = 8; let isBranch = 1; let Defs = []; @@ -475,9 +479,7 @@ def SI_LOOP : CFPseudoInstSI < let IsNeverUniform = 1; } -} // End isTerminator = 1 - -def SI_END_CF : CFPseudoInstSI < +def SI_WAVE_RECONVERGE : CFPseudoInstSI < (outs), (ins SReg_1:$saved), [], 1, 1> { let Size = 4; let isAsCheapAsAMove = 1; @@ -488,6 +490,8 @@ def SI_END_CF : CFPseudoInstSI < let mayStore = 1; } +} // End isTerminator = 1 + def SI_IF_BREAK : CFPseudoInstSI < (outs SReg_1:$dst), (ins SReg_1:$vcc, SReg_1:$src), []> { let Size = 4; diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index f178324dbbe24..99ecff2d95889 100644 --- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -25,7 +25,7 @@ /// %vgpr0 = V_ADD_F32 %vgpr0, %vgpr0 /// %sgpr0 = SI_ELSE %sgpr0 /// %vgpr0 = V_SUB_F32 %vgpr0, %vgpr0 -/// SI_END_CF %sgpr0 +/// SI_WAVE_RECONVERGE %sgpr0 /// /// becomes: /// @@ -82,7 +82,12 @@ class SILowerControlFlow : public MachineFunctionPass { SmallSet RecomputeRegs; const TargetRegisterClass *BoolRC = nullptr; + long unsigned TestMask; + unsigned Select; + unsigned CmovOpc; unsigned AndOpc; + unsigned AndTermOpc; + unsigned Andn2Opc; unsigned OrOpc; unsigned XorOpc; unsigned MovTermOpc; @@ -92,16 +97,17 @@ class SILowerControlFlow : public MachineFunctionPass { unsigned OrSaveExecOpc; unsigned Exec; - bool EnableOptimizeEndCf = false; - bool hasKill(const MachineBasicBlock *Begin, const MachineBasicBlock *End); void emitIf(MachineInstr &MI); void emitElse(MachineInstr &MI); void emitIfBreak(MachineInstr &MI); void emitLoop(MachineInstr &MI); + void emitWaterfallLoop(MachineInstr &MI); + void emitWaveDiverge(MachineInstr &MI, Register EnabledLanesMask, + Register DisableLanesMask, bool IsIf); - MachineBasicBlock *emitEndCf(MachineInstr &MI); + void emitWaveReconverge(MachineInstr &MI); void lowerInitExec(MachineBasicBlock *MBB, MachineInstr &MI); @@ -110,8 +116,6 @@ class SILowerControlFlow : public MachineFunctionPass { void combineMasks(MachineInstr &MI); - bool removeMBBifRedundant(MachineBasicBlock &MBB); - MachineBasicBlock *process(MachineInstr &MI); // Skip to the next instruction, ignoring debug instructions, and trivial @@ -134,9 +138,6 @@ class SILowerControlFlow : public MachineFunctionPass { return I; } - // Remove redundant SI_END_CF instructions. - void optimizeEndCf(); - public: static char ID; @@ -166,13 +167,6 @@ char SILowerControlFlow::ID = 0; INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE, "SI lower control flow", false, false) -static void setImpSCCDefDead(MachineInstr &MI, bool IsDead) { - MachineOperand &ImpDefSCC = MI.getOperand(3); - assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef()); - - ImpDefSCC.setIsDead(IsDead); -} - char &llvm::SILowerControlFlowID = SILowerControlFlow::ID; bool SILowerControlFlow::hasKill(const MachineBasicBlock *Begin, @@ -200,7 +194,7 @@ static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI) { if (U == MRI->use_instr_nodbg_end() || std::next(U) != MRI->use_instr_nodbg_end() || - U->getOpcode() != AMDGPU::SI_END_CF) + U->getOpcode() != AMDGPU::SI_WAVE_RECONVERGE) return false; return true; @@ -210,161 +204,36 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); MachineBasicBlock::iterator I(&MI); - Register SaveExecReg = MI.getOperand(0).getReg(); - MachineOperand& Cond = MI.getOperand(1); + Register MaskElse = MI.getOperand(0).getReg(); + MachineOperand &Cond = MI.getOperand(1); assert(Cond.getSubReg() == AMDGPU::NoSubRegister); - - MachineOperand &ImpDefSCC = MI.getOperand(4); - assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef()); - - // If there is only one use of save exec register and that use is SI_END_CF, - // we can optimize SI_IF by returning the full saved exec mask instead of - // just cleared bits. - bool SimpleIf = isSimpleIf(MI, MRI); - - if (SimpleIf) { - // Check for SI_KILL_*_TERMINATOR on path from if to endif. - // if there is any such terminator simplifications are not safe. - auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg); - SimpleIf = !hasKill(MI.getParent(), UseMI->getParent()); - } - - // Add an implicit def of exec to discourage scheduling VALU after this which - // will interfere with trying to form s_and_saveexec_b64 later. - Register CopyReg = SimpleIf ? SaveExecReg - : MRI->createVirtualRegister(BoolRC); - MachineInstr *CopyExec = - BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), CopyReg) - .addReg(Exec) - .addReg(Exec, RegState::ImplicitDefine); - LoweredIf.insert(CopyReg); - - Register Tmp = MRI->createVirtualRegister(BoolRC); - - MachineInstr *And = - BuildMI(MBB, I, DL, TII->get(AndOpc), Tmp) - .addReg(CopyReg) - .add(Cond); + Register CondReg = Cond.getReg(); + MachineInstr *CondRegDef = MRI->getVRegDef(CondReg); + if (CondRegDef && CondRegDef->getParent() == &MBB && TII->isVALU(*CondRegDef)) + return emitWaveDiverge(MI, CondReg, MaskElse, true); + + Register MaskThen = MRI->createVirtualRegister(BoolRC); + // Get rid of the garbage bits in the Cond register which might be coming from + // the bitwise arithmetic when one of the expression operands is coming from + // the outer scope and hence having extra bits set. + MachineInstr *CondFiltered = BuildMI(MBB, I, DL, TII->get(AndOpc), MaskThen) + .add(Cond) + .addReg(Exec); if (LV) - LV->replaceKillInstruction(Cond.getReg(), MI, *And); - - setImpSCCDefDead(*And, true); + LV->replaceKillInstruction(CondReg, MI, *CondFiltered); - MachineInstr *Xor = nullptr; - if (!SimpleIf) { - Xor = - BuildMI(MBB, I, DL, TII->get(XorOpc), SaveExecReg) - .addReg(Tmp) - .addReg(CopyReg); - setImpSCCDefDead(*Xor, ImpDefSCC.isDead()); - } - - // Use a copy that is a terminator to get correct spill code placement it with - // fast regalloc. - MachineInstr *SetExec = - BuildMI(MBB, I, DL, TII->get(MovTermOpc), Exec) - .addReg(Tmp, RegState::Kill); - if (LV) - LV->getVarInfo(Tmp).Kills.push_back(SetExec); + emitWaveDiverge(MI, MaskThen, MaskElse, true); - // Skip ahead to the unconditional branch in case there are other terminators - // present. - I = skipToUncondBrOrEnd(MBB, I); - - // Insert the S_CBRANCH_EXECZ instruction which will be optimized later - // during SIRemoveShortExecBranches. - MachineInstr *NewBr = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ)) - .add(MI.getOperand(2)); - - if (!LIS) { - MI.eraseFromParent(); - return; + if (LIS) { + LIS->InsertMachineInstrInMaps(*CondFiltered); + LIS->createAndComputeVirtRegInterval(MaskThen); } - - LIS->InsertMachineInstrInMaps(*CopyExec); - - // Replace with and so we don't need to fix the live interval for condition - // register. - LIS->ReplaceMachineInstrInMaps(MI, *And); - - if (!SimpleIf) - LIS->InsertMachineInstrInMaps(*Xor); - LIS->InsertMachineInstrInMaps(*SetExec); - LIS->InsertMachineInstrInMaps(*NewBr); - - LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC); - MI.eraseFromParent(); - - // FIXME: Is there a better way of adjusting the liveness? It shouldn't be - // hard to add another def here but I'm not sure how to correctly update the - // valno. - RecomputeRegs.insert(SaveExecReg); - LIS->createAndComputeVirtRegInterval(Tmp); - if (!SimpleIf) - LIS->createAndComputeVirtRegInterval(CopyReg); } void SILowerControlFlow::emitElse(MachineInstr &MI) { - MachineBasicBlock &MBB = *MI.getParent(); - const DebugLoc &DL = MI.getDebugLoc(); - - Register DstReg = MI.getOperand(0).getReg(); - Register SrcReg = MI.getOperand(1).getReg(); - - MachineBasicBlock::iterator Start = MBB.begin(); - - // This must be inserted before phis and any spill code inserted before the - // else. - Register SaveReg = MRI->createVirtualRegister(BoolRC); - MachineInstr *OrSaveExec = - BuildMI(MBB, Start, DL, TII->get(OrSaveExecOpc), SaveReg) - .add(MI.getOperand(1)); // Saved EXEC - if (LV) - LV->replaceKillInstruction(SrcReg, MI, *OrSaveExec); - - MachineBasicBlock *DestBB = MI.getOperand(2).getMBB(); - - MachineBasicBlock::iterator ElsePt(MI); - - // This accounts for any modification of the EXEC mask within the block and - // can be optimized out pre-RA when not required. - MachineInstr *And = BuildMI(MBB, ElsePt, DL, TII->get(AndOpc), DstReg) - .addReg(Exec) - .addReg(SaveReg); - - MachineInstr *Xor = - BuildMI(MBB, ElsePt, DL, TII->get(XorTermrOpc), Exec) - .addReg(Exec) - .addReg(DstReg); - - // Skip ahead to the unconditional branch in case there are other terminators - // present. - ElsePt = skipToUncondBrOrEnd(MBB, ElsePt); - - MachineInstr *Branch = - BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ)) - .addMBB(DestBB); - - if (!LIS) { - MI.eraseFromParent(); - return; - } - - LIS->RemoveMachineInstrFromMaps(MI); - MI.eraseFromParent(); - - LIS->InsertMachineInstrInMaps(*OrSaveExec); - LIS->InsertMachineInstrInMaps(*And); - - LIS->InsertMachineInstrInMaps(*Xor); - LIS->InsertMachineInstrInMaps(*Branch); - - RecomputeRegs.insert(SrcReg); - RecomputeRegs.insert(DstReg); - LIS->createAndComputeVirtRegInterval(SaveReg); - - // Let this be recomputed. - LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC); + Register InvCondReg = MI.getOperand(0).getReg(); + Register CondReg = MI.getOperand(1).getReg(); + emitWaveDiverge(MI, CondReg, InvCondReg, false); } void SILowerControlFlow::emitIfBreak(MachineInstr &MI) { @@ -425,141 +294,191 @@ void SILowerControlFlow::emitLoop(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); - MachineInstr *AndN2 = - BuildMI(MBB, &MI, DL, TII->get(Andn2TermOpc), Exec) - .addReg(Exec) - .add(MI.getOperand(0)); + Register Cond = MI.getOperand(0).getReg(); + Register MaskLoop = MRI->createVirtualRegister(BoolRC); + Register AndZero = MRI->createVirtualRegister(BoolRC); + + MachineInstr *CondLoop = BuildMI(MBB, &MI, DL, TII->get(Andn2Opc), MaskLoop) + .addReg(Exec) + .addReg(Cond); + + MachineInstr *IfZeroMask = BuildMI(MBB, &MI, DL, TII->get(AndOpc), AndZero) + .addReg(MaskLoop) + .addImm(TestMask); + + MachineInstr *SetExec= BuildMI(MBB, &MI, DL, TII->get(Select), Exec) + .addReg(MaskLoop) + .addReg(Cond); + if (LV) - LV->replaceKillInstruction(MI.getOperand(0).getReg(), MI, *AndN2); + LV->replaceKillInstruction(MI.getOperand(0).getReg(), MI, *SetExec); auto BranchPt = skipToUncondBrOrEnd(MBB, MI.getIterator()); MachineInstr *Branch = - BuildMI(MBB, BranchPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) + BuildMI(MBB, BranchPt, DL, TII->get(AMDGPU::S_CBRANCH_SCC1)) .add(MI.getOperand(1)); if (LIS) { RecomputeRegs.insert(MI.getOperand(0).getReg()); - LIS->ReplaceMachineInstrInMaps(MI, *AndN2); + LIS->ReplaceMachineInstrInMaps(MI, *SetExec); + LIS->InsertMachineInstrInMaps(*CondLoop); + LIS->InsertMachineInstrInMaps(*IfZeroMask); LIS->InsertMachineInstrInMaps(*Branch); + LIS->createAndComputeVirtRegInterval(MaskLoop); + LIS->createAndComputeVirtRegInterval(AndZero); } MI.eraseFromParent(); } -MachineBasicBlock::iterator -SILowerControlFlow::skipIgnoreExecInstsTrivialSucc( - MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const { - - SmallSet Visited; - MachineBasicBlock *B = &MBB; - do { - if (!Visited.insert(B).second) - return MBB.end(); +void SILowerControlFlow::emitWaterfallLoop(MachineInstr &MI) { + Register LoopMask = MI.getOperand(0).getReg(); + Register ExitMask = MI.getOperand(1).getReg(); + MachineBasicBlock *LoopBB = MI.getOperand(2).getMBB(); + MachineBasicBlock *BodyBB = MI.getParent(); + MachineBasicBlock::iterator I = BodyBB->end(); + const DebugLoc DL = MI.getDebugLoc(); + Register AndZero = MRI->createVirtualRegister( + TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID)); - auto E = B->end(); - for ( ; It != E; ++It) { - if (TII->mayReadEXEC(*MRI, *It)) - break; - } + MachineInstr *MaskZeroTest = BuildMI(*BodyBB, I, DL, TII->get(AndTermOpc), AndZero) + .addReg(LoopMask) + .addImm(TestMask); - if (It != E) - return It; + MachineInstr *UpdateExec = BuildMI(*BodyBB, I, DL, TII->get(Select), Exec) + .addReg(LoopMask) + .addReg(ExitMask); - if (B->succ_size() != 1) - return MBB.end(); + MachineInstr *Branch = BuildMI(*BodyBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1)).addMBB(LoopBB); - // If there is one trivial successor, advance to the next block. - MachineBasicBlock *Succ = *B->succ_begin(); + if (LIS) { + RecomputeRegs.insert(MI.getOperand(0).getReg()); + RecomputeRegs.insert(MI.getOperand(1).getReg()); + LIS->ReplaceMachineInstrInMaps(MI, *UpdateExec); + LIS->InsertMachineInstrInMaps(*MaskZeroTest); + LIS->InsertMachineInstrInMaps(*Branch); + LIS->createAndComputeVirtRegInterval(AndZero); + } - It = Succ->begin(); - B = Succ; - } while (true); + MI.eraseFromParent(); } -MachineBasicBlock *SILowerControlFlow::emitEndCf(MachineInstr &MI) { +void SILowerControlFlow::emitWaveDiverge(MachineInstr &MI, + Register EnabledLanesMask, + Register DisableLanesMask, bool IsIf) { + MachineBasicBlock &MBB = *MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); + MachineBasicBlock::iterator I(MI); - MachineBasicBlock::iterator InsPt = MBB.begin(); - - // If we have instructions that aren't prolog instructions, split the block - // and emit a terminator instruction. This ensures correct spill placement. - // FIXME: We should unconditionally split the block here. - bool NeedBlockSplit = false; - Register DataReg = MI.getOperand(0).getReg(); - for (MachineBasicBlock::iterator I = InsPt, E = MI.getIterator(); - I != E; ++I) { - if (I->modifiesRegister(DataReg, TRI)) { - NeedBlockSplit = true; - break; + bool NeedXor = true; + if (IsIf) { + // If there is only one use of save exec register and that use is SI_END_CF, + // we can optimize SI_IF by returning the full saved exec mask instead of + // just cleared bits. + bool SimpleIf = isSimpleIf(MI, MRI); + + if (SimpleIf) { + // Check for SI_KILL_*_TERMINATOR on path from if to endif. + // if there is any such terminator simplifications are not safe. + auto UseMI = MRI->use_instr_nodbg_begin(DisableLanesMask); + SimpleIf = !hasKill(MI.getParent(), UseMI->getParent()); } + NeedXor = !SimpleIf; } - unsigned Opcode = OrOpc; - MachineBasicBlock *SplitBB = &MBB; - if (NeedBlockSplit) { - SplitBB = MBB.splitAt(MI, /*UpdateLiveIns*/true, LIS); - if (MDT && SplitBB != &MBB) { - MachineDomTreeNode *MBBNode = (*MDT)[&MBB]; - SmallVector Children(MBBNode->begin(), - MBBNode->end()); - MachineDomTreeNode *SplitBBNode = MDT->addNewBlock(SplitBB, &MBB); - for (MachineDomTreeNode *Child : Children) - MDT->changeImmediateDominator(Child, SplitBBNode); - } - Opcode = OrTermrOpc; - InsPt = MI; - } + if (NeedXor) { - MachineInstr *NewMI = - BuildMI(MBB, InsPt, DL, TII->get(Opcode), Exec) - .addReg(Exec) - .add(MI.getOperand(0)); - if (LV) { - LV->replaceKillInstruction(DataReg, MI, *NewMI); - - if (SplitBB != &MBB) { - // Track the set of registers defined in the original block so we don't - // accidentally add the original block to AliveBlocks. AliveBlocks only - // includes blocks which are live through, which excludes live outs and - // local defs. - DenseSet DefInOrigBlock; - - for (MachineBasicBlock *BlockPiece : {&MBB, SplitBB}) { - for (MachineInstr &X : *BlockPiece) { - for (MachineOperand &Op : X.all_defs()) { - if (Op.getReg().isVirtual()) - DefInOrigBlock.insert(Op.getReg()); - } - } - } + MachineInstr *CondInverted = + BuildMI(MBB, I, DL, TII->get(XorOpc), DisableLanesMask) + .addReg(EnabledLanesMask) + .addReg(Exec); - for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) { - Register Reg = Register::index2VirtReg(i); - LiveVariables::VarInfo &VI = LV->getVarInfo(Reg); - - if (VI.AliveBlocks.test(MBB.getNumber())) - VI.AliveBlocks.set(SplitBB->getNumber()); - else { - for (MachineInstr *Kill : VI.Kills) { - if (Kill->getParent() == SplitBB && !DefInOrigBlock.contains(Reg)) - VI.AliveBlocks.set(MBB.getNumber()); - } - } + if (LV) { + LV->replaceKillInstruction(DisableLanesMask, MI, *CondInverted); + } + + if (LIS) { + LIS->InsertMachineInstrInMaps(*CondInverted); + } + } else { + MachineInstr *CopyExec = + BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DisableLanesMask) + .addReg(Exec); + if(LIS) + LIS->InsertMachineInstrInMaps(*CopyExec); + } + Register TestResultReg = MRI->createVirtualRegister(BoolRC); + MachineInstr *IfZeroMask = + BuildMI(MBB, I, DL, TII->get(AndOpc), TestResultReg) + .addReg(EnabledLanesMask) + .addImm(TestMask); + + MachineInstr *SetExecForSucc = + BuildMI(MBB, I, DL, TII->get(CmovOpc), Exec).addReg(EnabledLanesMask); + + MachineBasicBlock *FlowBB = MI.getOperand(2).getMBB(); + MachineBasicBlock *TargetBB = nullptr; + // determine target BBs + I = skipToUncondBrOrEnd(MBB, I); + if (I != MBB.end()) { + // skipToUncondBrOrEnd returns either unconditional branch or end() + TargetBB = I->getOperand(0).getMBB(); + I->getOperand(0).setMBB(FlowBB); + } else { + // assert(MBB.succ_size() == 2); + for (auto Succ : successors(&MBB)) { + if (Succ != FlowBB) { + TargetBB = Succ; + break; } } + I = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_BRANCH)).addMBB(FlowBB); + if (LIS) + LIS->InsertMachineInstrInMaps(*I); + } + + if (TargetBB) { + MachineInstr *NewBr = + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1)).addMBB(TargetBB); + if (LIS) + LIS->InsertMachineInstrInMaps(*NewBr); + } + + if (!LIS) { + MI.eraseFromParent(); + return; } - LoweredEndCf.insert(NewMI); + LIS->InsertMachineInstrInMaps(*IfZeroMask); + LIS->ReplaceMachineInstrInMaps(MI, *SetExecForSucc); - if (LIS) - LIS->ReplaceMachineInstrInMaps(MI, *NewMI); + RecomputeRegs.insert(MI.getOperand(0).getReg()); + RecomputeRegs.insert(MI.getOperand(1).getReg()); MI.eraseFromParent(); + LIS->createAndComputeVirtRegInterval(TestResultReg); + + LIS->removeAllRegUnitsForPhysReg(Exec); +} + +void SILowerControlFlow::emitWaveReconverge(MachineInstr &MI) { + + MachineBasicBlock &BB = *MI.getParent(); + Register Mask = MI.getOperand(0).getReg(); + + MachineInstr *ExecRestore = + BuildMI(BB, MI, MI.getDebugLoc(), TII->get(OrTermrOpc), Exec) + .addReg(Exec) + .addReg(Mask); + if (LV) + LV->replaceKillInstruction(Mask, MI, *ExecRestore); + if (LIS) - LIS->handleMove(*NewMI); - return SplitBB; + LIS->ReplaceMachineInstrInMaps(MI, *ExecRestore); + + MI.eraseFromParent(); } // Returns replace operands for a logical operation, either single result @@ -617,40 +536,6 @@ void SILowerControlFlow::combineMasks(MachineInstr &MI) { MRI->getUniqueVRegDef(Reg)->eraseFromParent(); } -void SILowerControlFlow::optimizeEndCf() { - // If the only instruction immediately following this END_CF is another - // END_CF in the only successor we can avoid emitting exec mask restore here. - if (!EnableOptimizeEndCf) - return; - - for (MachineInstr *MI : reverse(LoweredEndCf)) { - MachineBasicBlock &MBB = *MI->getParent(); - auto Next = - skipIgnoreExecInstsTrivialSucc(MBB, std::next(MI->getIterator())); - if (Next == MBB.end() || !LoweredEndCf.count(&*Next)) - continue; - // Only skip inner END_CF if outer ENDCF belongs to SI_IF. - // If that belongs to SI_ELSE then saved mask has an inverted value. - Register SavedExec - = TII->getNamedOperand(*Next, AMDGPU::OpName::src1)->getReg(); - assert(SavedExec.isVirtual() && "Expected saved exec to be src1!"); - - const MachineInstr *Def = MRI->getUniqueVRegDef(SavedExec); - if (Def && LoweredIf.count(SavedExec)) { - LLVM_DEBUG(dbgs() << "Skip redundant "; MI->dump()); - if (LIS) - LIS->RemoveMachineInstrFromMaps(*MI); - Register Reg; - if (LV) - Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg(); - MI->eraseFromParent(); - if (LV) - LV->recomputeForSingleDefVirtReg(Reg); - removeMBBifRedundant(MBB); - } - } -} - MachineBasicBlock *SILowerControlFlow::process(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); MachineBasicBlock::iterator I(MI); @@ -676,11 +561,11 @@ MachineBasicBlock *SILowerControlFlow::process(MachineInstr &MI) { break; case AMDGPU::SI_WATERFALL_LOOP: - MI.setDesc(TII->get(AMDGPU::S_CBRANCH_EXECNZ)); + emitWaterfallLoop(MI); break; - case AMDGPU::SI_END_CF: - SplitBB = emitEndCf(MI); + case AMDGPU::SI_WAVE_RECONVERGE: + emitWaveReconverge(MI); break; default: @@ -798,58 +683,10 @@ void SILowerControlFlow::lowerInitExec(MachineBasicBlock *MBB, LIS->createAndComputeVirtRegInterval(CountReg); } -bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) { - for (auto &I : MBB.instrs()) { - if (!I.isDebugInstr() && !I.isUnconditionalBranch()) - return false; - } - - assert(MBB.succ_size() == 1 && "MBB has more than one successor"); - - MachineBasicBlock *Succ = *MBB.succ_begin(); - MachineBasicBlock *FallThrough = nullptr; - - while (!MBB.predecessors().empty()) { - MachineBasicBlock *P = *MBB.pred_begin(); - if (P->getFallThrough(false) == &MBB) - FallThrough = P; - P->ReplaceUsesOfBlockWith(&MBB, Succ); - } - MBB.removeSuccessor(Succ); - if (LIS) { - for (auto &I : MBB.instrs()) - LIS->RemoveMachineInstrFromMaps(I); - } - if (MDT) { - // If Succ, the single successor of MBB, is dominated by MBB, MDT needs - // updating by changing Succ's idom to the one of MBB; otherwise, MBB must - // be a leaf node in MDT and could be erased directly. - if (MDT->dominates(&MBB, Succ)) - MDT->changeImmediateDominator(MDT->getNode(Succ), - MDT->getNode(&MBB)->getIDom()); - MDT->eraseNode(&MBB); - } - MBB.clear(); - MBB.eraseFromParent(); - if (FallThrough && !FallThrough->isLayoutSuccessor(Succ)) { - // Note: we cannot update block layout and preserve live intervals; - // hence we must insert a branch. - MachineInstr *BranchMI = BuildMI(*FallThrough, FallThrough->end(), - FallThrough->findBranchDebugLoc(), TII->get(AMDGPU::S_BRANCH)) - .addMBB(Succ); - if (LIS) - LIS->InsertMachineInstrInMaps(*BranchMI); - } - - return true; -} - bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget(); TII = ST.getInstrInfo(); TRI = &TII->getRegisterInfo(); - EnableOptimizeEndCf = RemoveRedundantEndcf && - MF.getTarget().getOptLevel() > CodeGenOptLevel::None; // This doesn't actually need LiveIntervals, but we can preserve them. LIS = getAnalysisIfAvailable(); @@ -860,7 +697,12 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { BoolRC = TRI->getBoolRC(); if (ST.isWave32()) { + TestMask = 0xffffffff; + Select = AMDGPU::S_CSELECT_B32_term; + CmovOpc = AMDGPU::S_CMOV_B32_term; AndOpc = AMDGPU::S_AND_B32; + AndTermOpc = AMDGPU::S_AND_B32_term; + Andn2Opc = AMDGPU::S_ANDN2_B32; OrOpc = AMDGPU::S_OR_B32; XorOpc = AMDGPU::S_XOR_B32; MovTermOpc = AMDGPU::S_MOV_B32_term; @@ -870,7 +712,12 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32; Exec = AMDGPU::EXEC_LO; } else { + TestMask = 0xffffffffffffffff; + Select = AMDGPU::S_CSELECT_B64_term; + CmovOpc = AMDGPU::S_CMOV_B64_term; AndOpc = AMDGPU::S_AND_B64; + AndTermOpc = AMDGPU::S_AND_B64_term; + Andn2Opc = AMDGPU::S_ANDN2_B64; OrOpc = AMDGPU::S_OR_B64; XorOpc = AMDGPU::S_XOR_B64; MovTermOpc = AMDGPU::S_MOV_B64_term; @@ -923,7 +770,7 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { case AMDGPU::SI_IF_BREAK: case AMDGPU::SI_WATERFALL_LOOP: case AMDGPU::SI_LOOP: - case AMDGPU::SI_END_CF: + case AMDGPU::SI_WAVE_RECONVERGE: SplitMBB = process(MI); Changed = true; break; @@ -948,8 +795,6 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { } } - optimizeEndCf(); - if (LIS) { for (Register Reg : RecomputeRegs) { LIS->removeInterval(Reg); @@ -958,7 +803,6 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { } RecomputeRegs.clear(); - LoweredEndCf.clear(); LoweredIf.clear(); KillBlocks.clear(); diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp index 3c60459e54e8f..04c8b2f94579f 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp @@ -114,7 +114,9 @@ Register SIOptimizeExecMasking::isCopyToExec(const MachineInstr &MI) const { switch (MI.getOpcode()) { case AMDGPU::COPY: case AMDGPU::S_MOV_B64: - case AMDGPU::S_MOV_B32: { + case AMDGPU::S_MOV_B32: + case AMDGPU::S_CMOV_B64: + case AMDGPU::S_CMOV_B32: { const MachineOperand &Dst = MI.getOperand(0); if (Dst.isReg() && Dst.getReg() == Exec && MI.getOperand(1).isReg()) return MI.getOperand(1).getReg(); diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp index 8204a70e72d91..8ab0fc20381ff 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp @@ -679,7 +679,7 @@ bool SIOptimizeVGPRLiveRange::runOnMachineFunction(MachineFunction &MF) { for (auto Reg : CandidateRegs) optimizeLiveRange(Reg, &MBB, IfTarget, Endif, ElseBlocks); } else if (MI.getOpcode() == AMDGPU::SI_WATERFALL_LOOP) { - auto *LoopHeader = MI.getOperand(0).getMBB(); + auto *LoopHeader = MI.getOperand(2).getMBB(); auto *LoopEnd = &MBB; LLVM_DEBUG(dbgs() << "Checking Waterfall loop: " diff --git a/llvm/test/%t b/llvm/test/%t new file mode 100644 index 0000000000000..a6daf0f199775 --- /dev/null +++ b/llvm/test/%t @@ -0,0 +1 @@ +remark: :0:0: removing function 'needs_extimg': +extended-image-insts is not supported on the current target diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir index ce00edf3363f7..1d81c08b83b6f 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir @@ -54,13 +54,13 @@ body: | %21:_(s32) = G_EXTRACT_VECTOR_ELT %9(<3 x s32>), %15(s32) %22:_(s32) = G_EXTRACT_VECTOR_ELT %9(<3 x s32>), %23(s32) %24:_(s1) = G_ICMP intpred(slt), %21(s32), %15 + G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %20(s64) bb.3: successors: %bb.4(0x40000000), %bb.5(0x40000000) %25:_(s32) = G_PHI %22(s32), %bb.2, %33(s32), %bb.1 %26:_(s1) = G_PHI %24(s1), %bb.2, %18(s1), %bb.1 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %20(s64) %27:_(s1), %28:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %26(s1) G_BRCOND %27(s1), %bb.4 G_BR %bb.5 @@ -69,10 +69,10 @@ body: | successors: %bb.5(0x80000000) %29:_(s32) = G_EXTRACT_VECTOR_ELT %9(<3 x s32>), %30(s32) + G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %28(s64) bb.5: %31:_(s32) = G_PHI %25(s32), %bb.3, %29(s32), %bb.4 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %28(s64) G_STORE %31(s32), %32(p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1) S_ENDPGM 0 diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/temporal-divergence.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/temporal-divergence.mir index 7bff87c09b3c9..d9f30ed06527c 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/temporal-divergence.mir +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/temporal-divergence.mir @@ -33,9 +33,6 @@ body: | G_BR %bb.3 bb.3: - ; CHECK: DIVERGENT: %{{[0-9]+}}: %{{[0-9]+}}:_(s64) = G_PHI - %14:_(s64) = G_PHI %12(s64), %bb.2 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %14(s64) S_ENDPGM 0 ... @@ -88,9 +85,6 @@ body: | G_BR %bb.5 bb.5: - ; CHECK: DIVERGENT: %{{[0-9]+}}: %{{[0-9]+}}:_(s64) = G_PHI - %18:_(s64) = G_PHI %16(s64), %bb.4 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %18(s64) G_BR %bb.3 bb.6: @@ -146,9 +140,6 @@ body: | G_BR %bb.5 bb.5: - ; CHECK: DIVERGENT: %{{[0-9]+}}: %{{[0-9]+}}:_(s64) = G_PHI - %18:_(s64) = G_PHI %16(s64), %bb.4 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %18(s64) G_BR %bb.3 bb.6: @@ -197,11 +188,8 @@ body: | G_BR %bb.4 bb.4: - ; CHECK: DIVERGENT: %{{[0-9]+}}: %{{[0-9]+}}:_(s64) = G_PHI successors: %bb.5, %bb.2 - %18:_(s64) = G_PHI %16(s64), %bb.3 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %18(s64) G_BRCOND %13(s1), %bb.2 G_BR %bb.5 @@ -257,9 +245,6 @@ body: | G_BR %bb.5 bb.5: - ; CHECK: DIVERGENT: %{{[0-9]+}}: %{{[0-9]+}}:_(s64) = G_PHI - %18:_(s64) = G_PHI %16(s64), %bb.4 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %18(s64) S_ENDPGM 0 ... @@ -297,10 +282,7 @@ body: | G_BR %bb.3 bb.3: - ; CHECK: DIVERGENT: %{{[0-9]+}}: %{{[0-9]+}}:_(s64) = G_PHI ; CHECK-NOT: DIVERGENT: %{{[0-9]+}}: %{{[0-9]+}}:_(s64) = G_PHI - %13:_(s64) = G_PHI %11(s64), %bb.2 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %13(s64) %14:_(p4) = COPY %3(p4) %15:_(s64) = G_CONSTANT i64 40 %16:_(p4) = G_PTR_ADD %14, %15(s64) @@ -360,9 +342,6 @@ body: | G_BR %bb.4 bb.4: - ; CHECK: DIVERGENT: %{{[0-9]+}}: %{{[0-9]+}}:_(s64) = G_PHI - %18:_(s64) = G_PHI %16(s64), %bb.3 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %18(s64) bb.5: diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uses-value-from-cycle.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uses-value-from-cycle.mir index b7e0d5449d2e8..c85805ac94836 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uses-value-from-cycle.mir +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uses-value-from-cycle.mir @@ -47,20 +47,18 @@ body: | %19:_(s32) = G_PHI %18(s32), %bb.7, %25(s32), %bb.4 %20:_(s32) = G_PHI %6(s32), %bb.7, %25(s32), %bb.4 %21:_(s1) = G_PHI %34(s1), %bb.7, %33(s1), %bb.4 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %16(s32) %22:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %21(s1), %0(s32) SI_LOOP %22(s32), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec G_BR %bb.6 bb.6: - %24:_(s32) = G_PHI %22(s32), %bb.5 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %24(s32) SI_RETURN bb.7: %34:_(s1) = G_CONSTANT i1 false %35:_(s32) = G_CONSTANT i32 1 %18:_(s32) = G_OR %2, %35 + G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %16(s32) G_BR %bb.5 ... diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/deprecated/hidden-diverge.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/deprecated/hidden-diverge.mir index d1a61100a14cb..8b7db292fa44e 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/deprecated/hidden-diverge.mir +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/deprecated/hidden-diverge.mir @@ -47,13 +47,13 @@ body: | S_CMP_LT_I32 killed %24, killed %25, implicit-def $scc %26:sreg_64 = COPY $scc %4:sreg_64 = COPY %26 + SI_WAVE_RECONVERGE %2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.2: successors: %bb.3(0x40000000), %bb.4(0x40000000) %5:sreg_32 = PHI %14, %bb.0, %3, %bb.1 %6:vreg_1 = PHI %1, %bb.0, %4, %bb.1 - SI_END_CF %2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec %27:sreg_64 = COPY %6 %7:sreg_64 = SI_IF %27, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.3 @@ -62,10 +62,10 @@ body: | successors: %bb.4(0x80000000) %8:sreg_32 = COPY %0.sub2 + SI_WAVE_RECONVERGE %7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.4: %9:vgpr_32 = PHI %5, %bb.2, %8, %bb.3 - SI_END_CF %7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec %28:sreg_64 = IMPLICIT_DEF %29:vreg_64 = COPY %28 GLOBAL_STORE_DWORD killed %29, %9, 0, 0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll index 220dc70165e87..5afc714ce7e4b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll @@ -1,5 +1,5 @@ +; XFAIL: * ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-atomic-optimizer %s | FileCheck -check-prefix=IR %s ; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll index d4d5cb18bbd30..00a3d3706508f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll @@ -117,10 +117,11 @@ define void @divergent_i1_phi_used_inside_loop(float %val, ptr %addr) { ; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo ; GFX10-NEXT: s_and_b32 s4, exec_lo, s4 ; GFX10-NEXT: s_or_b32 s6, s6, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB2_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_and_b32 s7, s4, -1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10-NEXT: ; %bb.2: ; %exit -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s6 ; GFX10-NEXT: flat_store_dword v[1:2], v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -147,24 +148,26 @@ define void @divergent_i1_phi_used_inside_loop_bigger_loop_body(float %val, floa ; GFX10-LABEL: divergent_i1_phi_used_inside_loop_bigger_loop_body: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_cmp_lt_f32_e64 s5, 1.0, v1 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: v_cmp_lt_f32_e64 s4, 1.0, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x3e8 -; GFX10-NEXT: v_mov_b32_e32 v8, s4 +; GFX10-NEXT: v_mov_b32_e32 v8, s5 ; GFX10-NEXT: ; implicit-def: $sgpr6 ; GFX10-NEXT: s_branch .LBB3_2 ; GFX10-NEXT: .LBB3_1: ; %loop_body ; GFX10-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v9, v8 -; GFX10-NEXT: s_xor_b32 s5, s5, -1 +; GFX10-NEXT: s_xor_b32 s4, s4, -1 ; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v8 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v9, v0 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo -; GFX10-NEXT: s_and_b32 s7, exec_lo, s5 +; GFX10-NEXT: s_and_b32 s7, exec_lo, s4 ; GFX10-NEXT: s_or_b32 s6, s6, s7 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execz .LBB3_6 +; GFX10-NEXT: s_andn2_b32 s7, exec_lo, s5 +; GFX10-NEXT: s_and_b32 s8, s7, -1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s7, s5 +; GFX10-NEXT: s_cbranch_scc0 .LBB3_6 ; GFX10-NEXT: .LBB3_2: ; %loop_start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_cmp_ge_i32_e32 vcc_lo, 0x3e8, v8 @@ -185,7 +188,6 @@ define void @divergent_i1_phi_used_inside_loop_bigger_loop_body(float %val, floa ; GFX10-NEXT: flat_store_dword v[4:5], v1 ; GFX10-NEXT: s_branch .LBB3_1 ; GFX10-NEXT: .LBB3_6: ; %exit -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s6 ; GFX10-NEXT: flat_store_dword v[2:3], v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir index 6594d7f504212..b0738eabb4304 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir @@ -206,7 +206,7 @@ body: | ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %22(s1), %bb.1 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %21(s1), %bb.1 ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C1]](s32), %bb.0 ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %9(s32), %bb.1 ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C]](s1), %bb.0, %11(s1), %bb.1 @@ -226,9 +226,7 @@ body: | ; GFX10-NEXT: G_BR %bb.2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.2: - ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.1 ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32) ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY5]](s1), [[C5]], [[C4]] @@ -263,8 +261,6 @@ body: | bb.2: %16:_(s1) = G_PHI %11(s1), %bb.1 - %17:_(s32) = G_PHI %7(s32), %bb.1 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %17(s32) %18:_(s32) = G_FCONSTANT float 0.000000e+00 %19:_(s32) = G_FCONSTANT float 1.000000e+00 %20:_(s32) = G_SELECT %16(s1), %19, %18 @@ -302,8 +298,8 @@ body: | ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %42(s1), %bb.5 - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[COPY8]](s1), %bb.0, %39(s1), %bb.5 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %41(s1), %bb.5 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[COPY8]](s1), %bb.0, %38(s1), %bb.5 ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI %15(s32), %bb.5, [[C]](s32), %bb.0 ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %17(s32), %bb.5 ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1) @@ -357,9 +353,7 @@ body: | ; GFX10-NEXT: G_BR %bb.6 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.6: - ; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.5 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI5]](s32) ; GFX10-NEXT: [[C10:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 ; GFX10-NEXT: [[C11:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY13]](s1), [[C11]], [[C10]] @@ -435,8 +429,6 @@ body: | bb.6: %33:_(s1) = G_PHI %19(s1), %bb.5 - %34:_(s32) = G_PHI %15(s32), %bb.5 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %34(s32) %35:_(s32) = G_FCONSTANT float 0.000000e+00 %36:_(s32) = G_FCONSTANT float 1.000000e+00 %37:_(s32) = G_SELECT %33(s1), %36, %35 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll index 49c232661c6dc..9b3a165adb5ba 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll @@ -33,11 +33,12 @@ define void @divergent_i1_phi_used_outside_loop(float %val, float %pre.cond.val, ; GFX10-NEXT: s_and_b32 s6, exec_lo, s6 ; GFX10-NEXT: s_or_b32 s7, s8, s7 ; GFX10-NEXT: s_or_b32 s5, s5, s6 +; GFX10-NEXT: s_andn2_b32 s8, exec_lo, s4 ; GFX10-NEXT: s_mov_b32 s6, s7 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB0_1 +; GFX10-NEXT: s_and_b32 s7, s8, -1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s8, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX10-NEXT: ; %bb.2: ; %exit -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s5 ; GFX10-NEXT: flat_store_dword v[2:3], v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -66,42 +67,45 @@ define void @divergent_i1_phi_used_outside_loop_larger_loop_body(float %val, ptr ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s4, -1 -; GFX10-NEXT: ; implicit-def: $sgpr6 ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: s_andn2_b32 s5, s4, exec_lo ; GFX10-NEXT: s_and_b32 s4, exec_lo, -1 -; GFX10-NEXT: s_or_b32 s4, s5, s4 +; GFX10-NEXT: s_or_b32 s7, s5, s4 +; GFX10-NEXT: ; implicit-def: $sgpr5 ; GFX10-NEXT: s_branch .LBB1_2 ; GFX10-NEXT: .LBB1_1: ; %loop.cond ; GFX10-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 1, v0 ; GFX10-NEXT: v_add_co_u32 v1, s4, v1, 4 ; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s4, 0, v2, s4 ; GFX10-NEXT: v_cmp_le_i32_e32 vcc_lo, 10, v0 -; GFX10-NEXT: s_andn2_b32 s7, s5, exec_lo -; GFX10-NEXT: s_and_b32 s8, exec_lo, s6 -; GFX10-NEXT: s_or_b32 s4, s7, s8 +; GFX10-NEXT: s_andn2_b32 s7, s6, exec_lo +; GFX10-NEXT: s_and_b32 s8, exec_lo, s5 +; GFX10-NEXT: s_or_b32 s7, s7, s8 ; GFX10-NEXT: s_cbranch_vccz .LBB1_4 ; GFX10-NEXT: .LBB1_2: ; %loop.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_mov_b32 s5, s4 -; GFX10-NEXT: s_andn2_b32 s4, s6, exec_lo -; GFX10-NEXT: s_and_b32 s6, exec_lo, s5 -; GFX10-NEXT: s_or_b32 s6, s4, s6 -; GFX10-NEXT: s_and_saveexec_b32 s4, s5 -; GFX10-NEXT: s_cbranch_execz .LBB1_1 +; GFX10-NEXT: s_mov_b32 s6, s7 +; GFX10-NEXT: s_andn2_b32 s5, s5, exec_lo +; GFX10-NEXT: s_and_b32 s7, exec_lo, s7 +; GFX10-NEXT: s_mov_b32 s4, exec_lo +; GFX10-NEXT: s_or_b32 s5, s5, s7 +; GFX10-NEXT: s_and_b32 s7, s6, exec_lo +; GFX10-NEXT: s_and_b32 s8, s7, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, s7 +; GFX10-NEXT: s_cbranch_scc0 .LBB1_1 ; GFX10-NEXT: ; %bb.3: ; %is.eq.zero ; GFX10-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GFX10-NEXT: global_load_dword v5, v[1:2], off -; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo +; GFX10-NEXT: s_andn2_b32 s5, s5, exec_lo ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v5 ; GFX10-NEXT: s_and_b32 s7, exec_lo, vcc_lo -; GFX10-NEXT: s_or_b32 s6, s6, s7 +; GFX10-NEXT: s_or_b32 s5, s5, s7 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_branch .LBB1_1 ; GFX10-NEXT: .LBB1_4: ; %exit -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s6 ; GFX10-NEXT: flat_store_dword v[3:4], v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -151,10 +155,11 @@ define void @divergent_i1_xor_used_outside_loop(float %val, float %pre.cond.val, ; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo ; GFX10-NEXT: s_and_b32 s7, exec_lo, s5 ; GFX10-NEXT: s_or_b32 s6, s6, s7 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB2_1 +; GFX10-NEXT: s_andn2_b32 s7, exec_lo, s4 +; GFX10-NEXT: s_and_b32 s8, s7, -1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s7, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10-NEXT: ; %bb.2: ; %exit -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s6 ; GFX10-NEXT: flat_store_dword v[2:3], v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -192,10 +197,12 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts, ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: s_mov_b32 s4, exec_lo ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, -1 -; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB3_6 +; GFX10-NEXT: s_and_b32 s7, vcc_lo, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB3_6 ; GFX10-NEXT: ; %bb.1: ; %loop.start.preheader ; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: ; implicit-def: $sgpr6 @@ -204,31 +211,34 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts, ; GFX10-NEXT: s_branch .LBB3_3 ; GFX10-NEXT: .LBB3_2: ; %Flow ; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX10-NEXT: s_xor_b32 s9, s8, -1 ; GFX10-NEXT: s_and_b32 s10, exec_lo, s7 ; GFX10-NEXT: s_or_b32 s5, s10, s5 ; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo ; GFX10-NEXT: s_and_b32 s9, exec_lo, s9 ; GFX10-NEXT: s_or_b32 s6, s6, s9 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execz .LBB3_5 +; GFX10-NEXT: s_andn2_b32 s9, exec_lo, s5 +; GFX10-NEXT: s_and_b32 s10, s9, -1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s9, s5 +; GFX10-NEXT: s_cbranch_scc0 .LBB3_5 ; GFX10-NEXT: .LBB3_3: ; %loop.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX10-NEXT: s_andn2_b32 s8, s8, exec_lo -; GFX10-NEXT: s_and_b32 s9, exec_lo, -1 +; GFX10-NEXT: s_and_b32 s10, exec_lo, -1 ; GFX10-NEXT: s_andn2_b32 s7, s7, exec_lo -; GFX10-NEXT: s_or_b32 s8, s8, s9 +; GFX10-NEXT: s_mov_b32 s9, exec_lo ; GFX10-NEXT: v_lshlrev_b64 v[6:7], 2, v[5:6] -; GFX10-NEXT: s_or_b32 s7, s7, s9 +; GFX10-NEXT: s_or_b32 s8, s8, s10 +; GFX10-NEXT: s_or_b32 s7, s7, s10 ; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v1, v6 ; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v2, v7, vcc_lo ; GFX10-NEXT: global_load_dword v6, v[6:7], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 -; GFX10-NEXT: s_and_saveexec_b32 s9, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB3_2 +; GFX10-NEXT: s_and_b32 s10, vcc_lo, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX10-NEXT: ; %bb.4: ; %loop.cond ; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1 ; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v5 @@ -240,22 +250,25 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts, ; GFX10-NEXT: s_and_b32 s11, exec_lo, vcc_lo ; GFX10-NEXT: s_or_b32 s8, s8, s10 ; GFX10-NEXT: s_or_b32 s7, s7, s11 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX10-NEXT: s_branch .LBB3_2 ; GFX10-NEXT: .LBB3_5: ; %loop.exit.guard -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_andn2_b32 s5, -1, exec_lo ; GFX10-NEXT: s_and_b32 s6, exec_lo, s6 ; GFX10-NEXT: s_or_b32 s6, s5, s6 -; GFX10-NEXT: .LBB3_6: ; %Flow1 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_and_saveexec_b32 s4, s6 -; GFX10-NEXT: s_cbranch_execz .LBB3_8 +; GFX10-NEXT: .LBB3_6: ; %Flow1 +; GFX10-NEXT: s_and_b32 s5, s6, exec_lo +; GFX10-NEXT: s_mov_b32 s4, exec_lo +; GFX10-NEXT: s_and_b32 s6, s5, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, s5 +; GFX10-NEXT: s_cbranch_scc0 .LBB3_8 ; GFX10-NEXT: ; %bb.7: ; %block.after.loop ; GFX10-NEXT: v_mov_b32_e32 v0, 5 ; GFX10-NEXT: flat_store_dword v[3:4], v0 -; GFX10-NEXT: .LBB3_8: ; %exit ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: .LBB3_8: ; %exit ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: @@ -302,20 +315,23 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace ; GFX10-NEXT: s_branch .LBB4_2 ; GFX10-NEXT: .LBB4_1: ; %Flow ; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; GFX10-NEXT: s_and_b32 s4, exec_lo, s7 +; GFX10-NEXT: s_and_b32 s4, exec_lo, s8 ; GFX10-NEXT: s_or_b32 s5, s4, s5 ; GFX10-NEXT: s_andn2_b32 s4, s6, exec_lo ; GFX10-NEXT: s_and_b32 s6, exec_lo, vcc_lo ; GFX10-NEXT: s_or_b32 s6, s4, s6 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execz .LBB4_6 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_and_b32 s7, s4, -1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX10-NEXT: .LBB4_2: ; %cond.block.0 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_mov_b32_e32 v4, v5 +; GFX10-NEXT: s_mov_b32 s7, exec_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 -; GFX10-NEXT: s_and_saveexec_b32 s7, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB4_4 +; GFX10-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB4_4 ; GFX10-NEXT: ; %bb.3: ; %if.block.0 ; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1 ; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v4 @@ -323,31 +339,36 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace ; GFX10-NEXT: v_add_co_u32 v8, s4, v2, v8 ; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s4, v3, v9, s4 ; GFX10-NEXT: global_store_dword v[8:9], v4, off -; GFX10-NEXT: .LBB4_4: ; %loop.break.block -; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s7 +; GFX10-NEXT: .LBB4_4: ; %loop.break.block +; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1 ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, v1, v4 -; GFX10-NEXT: s_mov_b32 s7, -1 +; GFX10-NEXT: s_mov_b32 s7, exec_lo +; GFX10-NEXT: s_mov_b32 s8, -1 ; GFX10-NEXT: ; implicit-def: $vgpr5 -; GFX10-NEXT: s_and_saveexec_b32 s8, s4 -; GFX10-NEXT: s_cbranch_execz .LBB4_1 +; GFX10-NEXT: s_and_b32 s9, s4, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, s4 +; GFX10-NEXT: s_cbranch_scc0 .LBB4_1 ; GFX10-NEXT: ; %bb.5: ; %loop.cond ; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1 ; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v4 ; GFX10-NEXT: s_andn2_b32 s4, -1, exec_lo -; GFX10-NEXT: s_and_b32 s7, exec_lo, 0 -; GFX10-NEXT: s_or_b32 s7, s4, s7 +; GFX10-NEXT: s_and_b32 s8, exec_lo, 0 +; GFX10-NEXT: s_or_b32 s8, s4, s8 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s7 ; GFX10-NEXT: s_branch .LBB4_1 ; GFX10-NEXT: .LBB4_6: ; %cond.block.1 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_and_saveexec_b32 s4, s6 -; GFX10-NEXT: s_cbranch_execz .LBB4_8 +; GFX10-NEXT: s_and_b32 s5, s6, exec_lo +; GFX10-NEXT: s_mov_b32 s4, exec_lo +; GFX10-NEXT: s_and_b32 s6, s5, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, s5 +; GFX10-NEXT: s_cbranch_scc0 .LBB4_8 ; GFX10-NEXT: ; %bb.7: ; %if.block.1 ; GFX10-NEXT: global_store_dword v[6:7], v4, off -; GFX10-NEXT: .LBB4_8: ; %exit ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: .LBB4_8: ; %exit ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: br label %loop.start @@ -413,7 +434,6 @@ define amdgpu_ps void @divergent_i1_freeze_used_outside_loop(i32 %n, ptr addrspa ; GFX10-NEXT: s_branch .LBB5_2 ; GFX10-NEXT: .LBB5_1: ; %loop.cond ; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_cmp_lt_i32_e32 vcc_lo, v5, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v5 ; GFX10-NEXT: s_or_b32 s0, vcc_lo, s0 @@ -422,15 +442,20 @@ define amdgpu_ps void @divergent_i1_freeze_used_outside_loop(i32 %n, ptr addrspa ; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo ; GFX10-NEXT: s_or_b32 s3, s3, s4 ; GFX10-NEXT: s_or_b32 s1, s1, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX10-NEXT: s_cbranch_execz .LBB5_4 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s0 +; GFX10-NEXT: s_and_b32 s5, s4, -1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s0 +; GFX10-NEXT: s_cbranch_scc0 .LBB5_4 ; GFX10-NEXT: .LBB5_2: ; %loop.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_andn2_b32 s2, s2, exec_lo -; GFX10-NEXT: s_and_b32 s4, exec_lo, s3 -; GFX10-NEXT: s_or_b32 s2, s2, s4 -; GFX10-NEXT: s_and_saveexec_b32 s4, s3 -; GFX10-NEXT: s_cbranch_execz .LBB5_1 +; GFX10-NEXT: s_and_b32 s5, exec_lo, s3 +; GFX10-NEXT: s_mov_b32 s4, exec_lo +; GFX10-NEXT: s_or_b32 s2, s2, s5 +; GFX10-NEXT: s_and_b32 s5, s3, exec_lo +; GFX10-NEXT: s_and_b32 s6, s5, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, s5 +; GFX10-NEXT: s_cbranch_scc0 .LBB5_1 ; GFX10-NEXT: ; %bb.3: ; %is.eq.zero ; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1 ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 @@ -444,9 +469,9 @@ define amdgpu_ps void @divergent_i1_freeze_used_outside_loop(i32 %n, ptr addrspa ; GFX10-NEXT: s_and_b32 s3, exec_lo, vcc_lo ; GFX10-NEXT: s_or_b32 s2, s2, s3 ; GFX10-NEXT: ; implicit-def: $sgpr3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_branch .LBB5_1 ; GFX10-NEXT: .LBB5_4: ; %exit -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s1 ; GFX10-NEXT: flat_store_dword v[3:4], v0 ; GFX10-NEXT: s_endpgm @@ -489,31 +514,34 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a ; GFX10-NEXT: s_branch .LBB6_2 ; GFX10-NEXT: .LBB6_1: ; %Flow ; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_and_b32 s4, exec_lo, s2 ; GFX10-NEXT: s_or_b32 s0, s4, s0 ; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo ; GFX10-NEXT: s_and_b32 s4, exec_lo, s3 ; GFX10-NEXT: s_or_b32 s1, s1, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX10-NEXT: s_cbranch_execz .LBB6_4 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s0 +; GFX10-NEXT: s_and_b32 s5, s4, -1 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s0 +; GFX10-NEXT: s_cbranch_scc0 .LBB6_4 ; GFX10-NEXT: .LBB6_2: ; %A ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GFX10-NEXT: s_andn2_b32 s3, s3, exec_lo -; GFX10-NEXT: s_and_b32 s4, exec_lo, -1 +; GFX10-NEXT: s_and_b32 s5, exec_lo, -1 ; GFX10-NEXT: s_andn2_b32 s2, s2, exec_lo -; GFX10-NEXT: s_or_b32 s3, s3, s4 +; GFX10-NEXT: s_mov_b32 s4, exec_lo ; GFX10-NEXT: v_lshlrev_b64 v[7:8], 2, v[6:7] -; GFX10-NEXT: s_or_b32 s2, s2, s4 +; GFX10-NEXT: s_or_b32 s3, s3, s5 +; GFX10-NEXT: s_or_b32 s2, s2, s5 ; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v2, v7 ; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v3, v8, vcc_lo ; GFX10-NEXT: global_load_dword v9, v[9:10], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9 -; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB6_1 +; GFX10-NEXT: s_and_b32 s5, vcc_lo, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB6_1 ; GFX10-NEXT: ; %bb.3: ; %loop.body ; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1 ; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v0, v7 @@ -531,12 +559,15 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v9 ; GFX10-NEXT: global_store_dword v[7:8], v9, off +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_branch .LBB6_1 ; GFX10-NEXT: .LBB6_4: ; %loop.exit.guard -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX10-NEXT: s_and_saveexec_b32 s0, s1 -; GFX10-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX10-NEXT: s_cbranch_execz .LBB6_6 +; GFX10-NEXT: s_and_b32 s0, s1, exec_lo +; GFX10-NEXT: s_xor_b32 s1, s0, exec_lo +; GFX10-NEXT: s_and_b32 s1, s0, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, s0 +; GFX10-NEXT: s_cbranch_scc0 .LBB6_6 ; GFX10-NEXT: ; %bb.5: ; %break.body ; GFX10-NEXT: v_mov_b32_e32 v0, 10 ; GFX10-NEXT: global_store_dword v[4:5], v0, off diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir index 5bbe3e4886899..ec9a5bd90634a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir @@ -30,8 +30,8 @@ body: | ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %36(s1), %bb.1 - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.0, %24(s1), %bb.1 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %35(s1), %bb.1 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.0, %23(s1), %bb.1 ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI %9(s32), %bb.1, [[C]](s32), %bb.0 ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %11(s32), %bb.1 ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1) @@ -56,9 +56,7 @@ body: | ; GFX10-NEXT: G_BR %bb.2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.2: - ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.1 ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_2]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32) ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY11]](s1), [[C5]], [[C4]] @@ -95,8 +93,6 @@ body: | bb.2: %18:_(s1) = G_PHI %12(s1), %bb.1 - %19:_(s32) = G_PHI %9(s32), %bb.1 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %19(s32) %20:_(s32) = G_FCONSTANT float 0.000000e+00 %21:_(s32) = G_FCONSTANT float 1.000000e+00 %22:_(s32) = G_SELECT %18(s1), %21, %20 @@ -155,6 +151,7 @@ body: | ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[LOAD]](s32), [[C2]] ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP]](s1) + ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), [[SI_IF]](s32) ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY10]](s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY11]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc @@ -165,7 +162,6 @@ body: | ; GFX10-NEXT: [[PHI4:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_1]](s1), %bb.1, [[S_OR_B32_2]](s1), %bb.2 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[PHI4]](s1) ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY12]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32) ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[PHI3]], [[C3]](s64) ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 @@ -214,12 +210,12 @@ body: | %15:_(s32) = G_LOAD %10(p1) :: (load (s32), addrspace 1) %16:_(s32) = G_CONSTANT i32 0 %17:_(s1) = G_ICMP intpred(eq), %15(s32), %16 + G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %14(s32) bb.3: successors: %bb.4(0x04000000), %bb.1(0x7c000000) %13:_(s1) = G_PHI %17(s1), %bb.2, %12(s1), %bb.1 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %14(s32) %18:_(s64) = G_CONSTANT i64 4 %11:_(p1) = G_PTR_ADD %10, %18(s64) %19:_(s32) = G_CONSTANT i32 1 @@ -262,8 +258,8 @@ body: | ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %27(s1), %bb.1 - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[COPY4]](s1), %bb.0, %24(s1), %bb.1 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %26(s1), %bb.1 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[COPY4]](s1), %bb.0, %23(s1), %bb.1 ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI %9(s32), %bb.1, [[C]](s32), %bb.0 ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %11(s32), %bb.1 ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1) @@ -284,9 +280,7 @@ body: | ; GFX10-NEXT: G_BR %bb.2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.2: - ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.1 ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32) ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY9]](s1), [[C5]], [[C4]] @@ -323,8 +317,6 @@ body: | bb.2: %18:_(s1) = G_PHI %13(s1), %bb.1 - %19:_(s32) = G_PHI %9(s32), %bb.1 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %19(s32) %20:_(s32) = G_FCONSTANT float 0.000000e+00 %21:_(s32) = G_FCONSTANT float 1.000000e+00 %22:_(s32) = G_SELECT %18(s1), %21, %20 @@ -362,6 +354,7 @@ body: | ; GFX10-NEXT: successors: %bb.3(0x80000000) ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), [[SI_IF]](s32) ; GFX10-NEXT: [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF ; GFX10-NEXT: [[DEF2:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF ; GFX10-NEXT: [[DEF3:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF @@ -370,18 +363,17 @@ body: | ; GFX10-NEXT: bb.2: ; GFX10-NEXT: successors: %bb.5(0x40000000), %bb.6(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[COPY5]](s1), %bb.0, %40(s1), %bb.8 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[COPY5]](s1), %bb.0, %39(s1), %bb.8 ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32) ; GFX10-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY7]](s1), %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.5 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.3: ; GFX10-NEXT: successors: %bb.4(0x40000000), %bb.7(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF3]](s1), %bb.1, %73(s1), %bb.7 - ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[DEF2]](s1), %bb.1, %62(s1), %bb.7 - ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.1, %49(s1), %bb.7 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF3]](s1), %bb.1, %72(s1), %bb.7 + ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[DEF2]](s1), %bb.1, %61(s1), %bb.7 + ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.1, %48(s1), %bb.7 ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[C2]](s32), %bb.1, %17(s32), %bb.7 ; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s32) = G_PHI %19(s32), %bb.7, [[C2]](s32), %bb.1 ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1) @@ -417,6 +409,7 @@ body: | ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI5]], [[C7]] ; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[PHI5]](s32), [[COPY]] ; GFX10-NEXT: [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1) + ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), [[SI_IF2]](s32) ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY13]](s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY15]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc @@ -430,9 +423,9 @@ body: | ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 ; GFX10-NEXT: G_STORE [[C8]](s32), [[MV1]](p0) :: (store (s32)) + ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), [[SI_IF1]](s32) ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.6: - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32) ; GFX10-NEXT: SI_RETURN ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.7: @@ -443,7 +436,6 @@ body: | ; GFX10-NEXT: [[PHI8:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.4, [[DEF]](s32), %bb.3 ; GFX10-NEXT: [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY [[PHI6]](s1) ; GFX10-NEXT: [[COPY18:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF2]](s32) ; GFX10-NEXT: [[C9:%[0-9]+]]:_(s1) = G_CONSTANT i1 true ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[COPY18]], [[C9]] ; GFX10-NEXT: [[COPY19:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1) @@ -457,10 +449,8 @@ body: | ; GFX10-NEXT: bb.8: ; GFX10-NEXT: successors: %bb.2(0x80000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI9:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.7 ; GFX10-NEXT: [[COPY20:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_4]](s1) ; GFX10-NEXT: [[COPY21:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY20]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI9]](s32) ; GFX10-NEXT: [[S_ANDN2_B32_5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY21]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_5]](s1), [[S_AND_B32_5]](s1), implicit-def $scc @@ -487,13 +477,13 @@ body: | successors: %bb.3(0x80000000) %12:_(s32) = G_CONSTANT i32 0 + G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %11(s32) G_BR %bb.3 bb.2: successors: %bb.5(0x40000000), %bb.6(0x40000000) %13:sreg_32_xm0_xexec(s1) = G_PHI %14(s1), %bb.8, %10(s1), %bb.0 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %11(s32) %15:sreg_32_xm0_xexec(s32) = SI_IF %13(s1), %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec G_BR %bb.5 @@ -520,6 +510,7 @@ body: | %30:_(s32) = G_CONSTANT i32 1 %31:_(s32) = G_ADD %18, %30 %32:_(s1) = G_ICMP intpred(slt), %18(s32), %0 + G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %28(s32) G_BR %bb.7 bb.5: @@ -527,9 +518,9 @@ body: | %33:_(s32) = G_CONSTANT i32 5 G_STORE %33(s32), %6(p0) :: (store (s32)) + G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %15(s32) bb.6: - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %15(s32) SI_RETURN bb.7: @@ -538,7 +529,6 @@ body: | %19:_(s32) = G_PHI %31(s32), %bb.4, %7(s32), %bb.3 %34:_(s1) = G_PHI %29(s1), %bb.4, %20(s1), %bb.3 %35:_(s1) = G_PHI %32(s1), %bb.4, %20(s1), %bb.3 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %28(s32) %36:_(s1) = G_CONSTANT i1 true %37:_(s1) = G_XOR %34, %36 %17:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %35(s1), %16(s32) @@ -549,8 +539,6 @@ body: | successors: %bb.2(0x80000000) %14:_(s1) = G_PHI %37(s1), %bb.7 - %38:_(s32) = G_PHI %17(s32), %bb.7 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %38(s32) G_BR %bb.2 ... @@ -579,7 +567,7 @@ body: | ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.2(0x80000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[DEF1]](s1), %bb.0, %39(s1), %bb.6 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[DEF1]](s1), %bb.0, %38(s1), %bb.6 ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %11(s32), %bb.6, [[C]](s32), %bb.0 ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %13(s32), %bb.6 ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]](s1) @@ -600,12 +588,12 @@ body: | ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C1]](s32) ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV]], [[SHL]](s64) ; GFX10-NEXT: G_STORE [[PHI2]](s32), [[PTR_ADD]](p1) :: (store (s32), addrspace 1) + ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), [[SI_IF]](s32) ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.4: ; GFX10-NEXT: successors: %bb.5(0x40000000), %bb.6(0x40000000) ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32) ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[COPY1]](s32), [[PHI2]] ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[C2]](s1) ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[COPY8]](s1) @@ -619,6 +607,7 @@ body: | ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1) ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C4]] + ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), [[SI_IF1]](s32) ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY9]](s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY10]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc @@ -629,7 +618,6 @@ body: | ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[COPY8]](s1), %bb.4, [[S_OR_B32_]](s1), %bb.5 ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.5, [[DEF]](s32), %bb.4 ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32) ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY11]](s1), [[PHI1]](s32) ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY7]](s1), implicit-def $scc @@ -640,20 +628,18 @@ body: | ; GFX10-NEXT: bb.7: ; GFX10-NEXT: successors: %bb.8(0x40000000), %bb.9(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.6 - ; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[PHI2]](s32), %bb.6 + ; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[PHI2]](s32), %bb.6 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[S_OR_B32_1]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI5]](s32) ; GFX10-NEXT: [[SI_IF2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY12]](s1), %bb.9, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.8 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.8: ; GFX10-NEXT: successors: %bb.9(0x80000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: G_STORE [[PHI6]](s32), [[MV1]](p1) :: (store (s32), addrspace 1) + ; GFX10-NEXT: G_STORE [[PHI5]](s32), [[MV1]](p1) :: (store (s32), addrspace 1) + ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), [[SI_IF2]](s32) ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.9: - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF2]](s32) ; GFX10-NEXT: SI_RETURN bb.0: successors: %bb.1(0x80000000) @@ -691,12 +677,12 @@ body: | %18:_(s64) = G_SHL %16, %17(s32) %19:_(p1) = G_PTR_ADD %4, %18(s64) G_STORE %12(s32), %19(p1) :: (store (s32), addrspace 1) + G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %15(s32) bb.4: successors: %bb.5(0x40000000), %bb.6(0x40000000) %20:_(s1) = G_CONSTANT i1 true - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %15(s32) %21:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), %1(s32), %12 %22:sreg_32_xm0_xexec(s32) = SI_IF %21(s1), %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec G_BR %bb.5 @@ -707,13 +693,13 @@ body: | %23:_(s1) = G_CONSTANT i1 false %24:_(s32) = G_CONSTANT i32 1 %25:_(s32) = G_ADD %12, %24 + G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %22(s32) bb.6: successors: %bb.7(0x04000000), %bb.1(0x7c000000) %13:_(s32) = G_PHI %25(s32), %bb.5, %9(s32), %bb.4 %26:_(s1) = G_PHI %23(s1), %bb.5, %20(s1), %bb.4 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %22(s32) %11:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %26(s1), %10(s32) SI_LOOP %11(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec G_BR %bb.7 @@ -721,10 +707,8 @@ body: | bb.7: successors: %bb.8(0x40000000), %bb.9(0x40000000) - %27:_(s32) = G_PHI %11(s32), %bb.6 %28:sreg_32_xm0_xexec(s1) = G_PHI %14(s1), %bb.6 %29:_(s32) = G_PHI %12(s32), %bb.6 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %27(s32) %30:sreg_32_xm0_xexec(s32) = SI_IF %28(s1), %bb.9, implicit-def $exec, implicit-def $scc, implicit $exec G_BR %bb.8 @@ -732,9 +716,9 @@ body: | successors: %bb.9(0x80000000) G_STORE %29(s32), %7(p1) :: (store (s32), addrspace 1) + G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %30(s32) bb.9: - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %30(s32) SI_RETURN ... @@ -764,9 +748,9 @@ body: | ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %54(s1), %bb.3 - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %43(s1), %bb.3 - ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[COPY5]](s1), %bb.0, %33(s1), %bb.3 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %53(s1), %bb.3 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %42(s1), %bb.3 + ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[COPY5]](s1), %bb.0, %32(s1), %bb.3 ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI %10(s32), %bb.3, [[C]](s32), %bb.0 ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %12(s32), %bb.3 ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1) @@ -791,6 +775,7 @@ body: | ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[LOAD]](s32), [[C3]] ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP]](s1) + ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), [[SI_IF]](s32) ; GFX10-NEXT: [[DEF2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = IMPLICIT_DEF ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY10]](s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY11]](s1), implicit-def $scc @@ -803,7 +788,6 @@ body: | ; GFX10-NEXT: [[PHI6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[PHI2]](s1), %bb.1, [[DEF2]](s1), %bb.2 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1) ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI6]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32) ; GFX10-NEXT: [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE [[COPY12]] ; GFX10-NEXT: [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[FREEZE]](s1) ; GFX10-NEXT: [[COPY15:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[FREEZE]](s1) @@ -821,9 +805,7 @@ body: | ; GFX10-NEXT: G_BR %bb.4 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.4: - ; GFX10-NEXT: [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.3 ; GFX10-NEXT: [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_3]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI7]](s32) ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 ; GFX10-NEXT: [[C6:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY16]](s1), [[C6]], [[C5]] @@ -862,12 +844,12 @@ body: | %20:_(s32) = G_LOAD %19(p1) :: (load (s32), addrspace 1) %21:_(s32) = G_CONSTANT i32 0 %22:_(s1) = G_ICMP intpred(eq), %20(s32), %21 + G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %15(s32) bb.3: successors: %bb.4(0x04000000), %bb.1(0x7c000000) %23:_(s1) = G_PHI %22(s1), %bb.2, %13(s1), %bb.1 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %15(s32) %14:_(s1) = G_FREEZE %23 %24:_(s32) = G_CONSTANT i32 1 %12:_(s32) = G_ADD %11, %24 @@ -878,8 +860,6 @@ body: | bb.4: %26:_(s1) = G_PHI %14(s1), %bb.3 - %27:_(s32) = G_PHI %10(s32), %bb.3 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %27(s32) %28:_(s32) = G_FCONSTANT float 0.000000e+00 %29:_(s32) = G_FCONSTANT float 1.000000e+00 %30:_(s32) = G_SELECT %26(s1), %29, %28 @@ -915,9 +895,9 @@ body: | ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[DEF3]](s1), %bb.0, %67(s1), %bb.5 - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF2]](s1), %bb.0, %56(s1), %bb.5 - ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %43(s1), %bb.5 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[DEF3]](s1), %bb.0, %66(s1), %bb.5 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF2]](s1), %bb.0, %55(s1), %bb.5 + ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %42(s1), %bb.5 ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI %12(s32), %bb.5, [[C]](s32), %bb.0 ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %14(s32), %bb.5 ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]](s1) @@ -949,6 +929,7 @@ body: | ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 ; GFX10-NEXT: G_STORE [[C4]](s32), [[MV2]](p1) :: (store (s32), addrspace 1) + ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %25(s32) ; GFX10-NEXT: G_BR %bb.4 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.3: @@ -967,6 +948,7 @@ body: | ; GFX10-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 100 ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI4]](s32), [[C8]] ; GFX10-NEXT: [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP1]](s1) + ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), [[SI_IF]](s32) ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY11]](s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY13]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc @@ -976,7 +958,6 @@ body: | ; GFX10-NEXT: G_BR %bb.5 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.4: - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %35(s32) ; GFX10-NEXT: S_ENDPGM 0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.5: @@ -988,7 +969,6 @@ body: | ; GFX10-NEXT: [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1) ; GFX10-NEXT: [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[PHI6]](s1) ; GFX10-NEXT: [[COPY17:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY16]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32) ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY15]](s1), [[PHI3]](s32) ; GFX10-NEXT: [[S_ANDN2_B32_4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY17]](s1), implicit-def $scc @@ -999,9 +979,7 @@ body: | ; GFX10-NEXT: bb.6: ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI8:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.5 ; GFX10-NEXT: [[COPY18:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[S_OR_B32_4]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI8]](s32) ; GFX10-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY18]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.2 bb.0: @@ -1041,6 +1019,7 @@ body: | %24:_(s32) = G_CONSTANT i32 10 G_STORE %24(s32), %8(p1) :: (store (s32), addrspace 1) + G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %35(s32) G_BR %bb.4 bb.3: @@ -1057,10 +1036,10 @@ body: | %32:_(s32) = G_ADD %13, %30 %33:_(s32) = G_CONSTANT i32 100 %34:_(s1) = G_ICMP intpred(ult), %13(s32), %33 + G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %23(s32) G_BR %bb.5 bb.4: - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %35(s32) S_ENDPGM 0 bb.5: @@ -1069,7 +1048,6 @@ body: | %14:_(s32) = G_PHI %32(s32), %bb.3, %10(s32), %bb.1 %36:_(s1) = G_PHI %25(s1), %bb.3, %15(s1), %bb.1 %37:_(s1) = G_PHI %34(s1), %bb.3, %15(s1), %bb.1 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %23(s32) %12:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %37(s1), %11(s32) SI_LOOP %12(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec G_BR %bb.6 @@ -1078,8 +1056,6 @@ body: | successors: %bb.2(0x40000000), %bb.4(0x40000000) %38:sreg_32_xm0_xexec(s1) = G_PHI %36(s1), %bb.5 - %39:_(s32) = G_PHI %12(s32), %bb.5 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %39(s32) %35:sreg_32_xm0_xexec(s32) = SI_IF %38(s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec G_BR %bb.2 ... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll index 1698f84eea518..c7ef9501da8d5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll @@ -7,17 +7,20 @@ define amdgpu_ps void @divergent_i1_phi_if_then(ptr addrspace(1) %out, i32 %tid, i32 %cond) { ; GFX10-LABEL: divergent_i1_phi_if_then: ; GFX10: ; %bb.0: ; %A -; GFX10-NEXT: v_cmp_le_u32_e64 s0, 6, v2 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e64 s1, 6, v2 +; GFX10-NEXT: s_mov_b32 s0, exec_lo +; GFX10-NEXT: s_and_b32 s2, vcc_lo, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX10-NEXT: ; %bb.1: ; %B ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 1, v2 -; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo +; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo ; GFX10-NEXT: s_and_b32 s2, exec_lo, vcc_lo -; GFX10-NEXT: s_or_b32 s0, s0, s2 -; GFX10-NEXT: ; %bb.2: ; %exit -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, s0 +; GFX10-NEXT: s_or_b32 s1, s1, s2 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-NEXT: .LBB0_2: ; %exit +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, s1 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 2, v2 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm @@ -41,26 +44,32 @@ exit: define amdgpu_ps void @divergent_i1_phi_if_else(ptr addrspace(1) %out, i32 %tid, i32 %cond) { ; GFX10-LABEL: divergent_i1_phi_if_else: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_and_b32 s0, 1, s0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 +; GFX10-NEXT: s_and_b32 s0, 1, s0 ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 -; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX10-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX10-NEXT: s_and_b32 s2, vcc_lo, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX10-NEXT: ; %bb.1: ; %B ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 2, v2 ; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo ; GFX10-NEXT: ; implicit-def: $vgpr2 ; GFX10-NEXT: s_and_b32 s2, exec_lo, vcc_lo ; GFX10-NEXT: s_or_b32 s0, s0, s2 -; GFX10-NEXT: ; %bb.2: ; %Flow -; GFX10-NEXT: s_andn2_saveexec_b32 s1, s1 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-NEXT: .LBB1_2: ; %Flow +; GFX10-NEXT: s_xor_b32 s2, s1, exec_lo +; GFX10-NEXT: s_and_b32 s3, s1, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, s1 +; GFX10-NEXT: s_cbranch_scc0 .LBB1_4 ; GFX10-NEXT: ; %bb.3: ; %A ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 1, v2 ; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo -; GFX10-NEXT: s_and_b32 s2, exec_lo, vcc_lo -; GFX10-NEXT: s_or_b32 s0, s0, s2 -; GFX10-NEXT: ; %bb.4: ; %exit -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-NEXT: s_and_b32 s1, exec_lo, vcc_lo +; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: .LBB1_4: ; %exit ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, s0 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 2, v2 ; GFX10-NEXT: global_store_dword v[0:1], v2, off @@ -111,26 +120,29 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a ; GFX10-NEXT: s_branch .LBB2_2 ; GFX10-NEXT: .LBB2_1: ; %Flow ; GFX10-NEXT: ; in Loop: Header=BB2_2 Depth=1 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: s_and_b32 s2, exec_lo, s1 ; GFX10-NEXT: s_or_b32 s0, s2, s0 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX10-NEXT: s_cbranch_execz .LBB2_4 +; GFX10-NEXT: s_andn2_b32 s2, exec_lo, s0 +; GFX10-NEXT: s_and_b32 s3, s2, -1 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_cselect_b32 exec_lo, s2, s0 +; GFX10-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX10-NEXT: .LBB2_2: ; %A ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo -; GFX10-NEXT: s_and_b32 s2, exec_lo, -1 -; GFX10-NEXT: s_or_b32 s1, s1, s2 +; GFX10-NEXT: s_and_b32 s3, exec_lo, -1 +; GFX10-NEXT: s_mov_b32 s2, exec_lo +; GFX10-NEXT: s_or_b32 s1, s1, s3 ; GFX10-NEXT: v_lshlrev_b64 v[5:6], 2, v[4:5] ; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v2, v5 ; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v3, v6, vcc_lo ; GFX10-NEXT: global_load_dword v7, v[7:8], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v7 -; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB2_1 +; GFX10-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB2_1 ; GFX10-NEXT: ; %bb.3: ; %loop.body ; GFX10-NEXT: ; in Loop: Header=BB2_2 Depth=1 ; GFX10-NEXT: v_add_co_u32 v5, vcc_lo, v0, v5 @@ -145,6 +157,8 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v7, 1, v7 ; GFX10-NEXT: global_store_dword v[5:6], v7, off +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: s_branch .LBB2_1 ; GFX10-NEXT: .LBB2_4: ; %exit ; GFX10-NEXT: s_endpgm @@ -180,42 +194,47 @@ define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; GFX10-NEXT: s_branch .LBB3_3 ; GFX10-NEXT: .LBB3_1: ; %Flow3 ; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo ; GFX10-NEXT: s_and_b32 s3, exec_lo, s4 ; GFX10-NEXT: s_or_b32 s1, s1, s3 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: .LBB3_2: ; %Flow ; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: s_and_b32 s2, exec_lo, s1 ; GFX10-NEXT: s_or_b32 s0, s2, s0 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX10-NEXT: s_cbranch_execz .LBB3_6 +; GFX10-NEXT: s_andn2_b32 s2, exec_lo, s0 +; GFX10-NEXT: s_and_b32 s3, s2, -1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s2, s0 +; GFX10-NEXT: s_cbranch_scc0 .LBB3_6 ; GFX10-NEXT: .LBB3_3: ; %A ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo -; GFX10-NEXT: s_and_b32 s2, exec_lo, -1 -; GFX10-NEXT: s_or_b32 s1, s1, s2 +; GFX10-NEXT: s_and_b32 s3, exec_lo, -1 +; GFX10-NEXT: s_mov_b32 s2, exec_lo +; GFX10-NEXT: s_or_b32 s1, s1, s3 ; GFX10-NEXT: v_lshlrev_b64 v[7:8], 2, v[6:7] ; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v2, v7 ; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v3, v8, vcc_lo ; GFX10-NEXT: global_load_dword v9, v[9:10], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9 -; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB3_2 +; GFX10-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX10-NEXT: ; %bb.4: ; %B ; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1 ; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v4, v7 ; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v5, v8, vcc_lo +; GFX10-NEXT: s_mov_b32 s3, exec_lo ; GFX10-NEXT: s_mov_b32 s4, -1 ; GFX10-NEXT: global_load_dword v9, v[9:10], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9 -; GFX10-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB3_1 +; GFX10-NEXT: s_and_b32 s5, vcc_lo, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB3_1 ; GFX10-NEXT: ; %bb.5: ; %loop.body ; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1 ; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v0, v7 @@ -230,6 +249,8 @@ define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v9 ; GFX10-NEXT: global_store_dword v[7:8], v9, off +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX10-NEXT: s_branch .LBB3_1 ; GFX10-NEXT: .LBB3_6: ; %exit ; GFX10-NEXT: s_endpgm @@ -271,58 +292,65 @@ define amdgpu_cs void @loop_with_3breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; GFX10-NEXT: s_branch .LBB4_4 ; GFX10-NEXT: .LBB4_1: ; %Flow5 ; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_andn2_b32 s4, -1, exec_lo ; GFX10-NEXT: s_and_b32 s5, exec_lo, s5 ; GFX10-NEXT: s_or_b32 s4, s4, s5 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX10-NEXT: .LBB4_2: ; %Flow4 ; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo ; GFX10-NEXT: s_and_b32 s3, exec_lo, s4 ; GFX10-NEXT: s_or_b32 s1, s1, s3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: .LBB4_3: ; %Flow ; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: s_and_b32 s2, exec_lo, s1 ; GFX10-NEXT: s_or_b32 s0, s2, s0 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX10-NEXT: s_cbranch_execz .LBB4_8 +; GFX10-NEXT: s_andn2_b32 s2, exec_lo, s0 +; GFX10-NEXT: s_and_b32 s3, s2, -1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s2, s0 +; GFX10-NEXT: s_cbranch_scc0 .LBB4_8 ; GFX10-NEXT: .LBB4_4: ; %A ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo -; GFX10-NEXT: s_and_b32 s2, exec_lo, -1 -; GFX10-NEXT: s_or_b32 s1, s1, s2 +; GFX10-NEXT: s_and_b32 s3, exec_lo, -1 +; GFX10-NEXT: s_mov_b32 s2, exec_lo +; GFX10-NEXT: s_or_b32 s1, s1, s3 ; GFX10-NEXT: v_lshlrev_b64 v[9:10], 2, v[8:9] ; GFX10-NEXT: v_add_co_u32 v11, vcc_lo, v2, v9 ; GFX10-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, v3, v10, vcc_lo ; GFX10-NEXT: global_load_dword v11, v[11:12], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 -; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB4_3 +; GFX10-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX10-NEXT: ; %bb.5: ; %B ; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1 ; GFX10-NEXT: v_add_co_u32 v11, vcc_lo, v4, v9 ; GFX10-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, v5, v10, vcc_lo +; GFX10-NEXT: s_mov_b32 s3, exec_lo ; GFX10-NEXT: s_mov_b32 s4, -1 ; GFX10-NEXT: global_load_dword v11, v[11:12], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 -; GFX10-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB4_2 +; GFX10-NEXT: s_and_b32 s5, vcc_lo, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX10-NEXT: ; %bb.6: ; %C ; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1 ; GFX10-NEXT: v_add_co_u32 v11, vcc_lo, v6, v9 ; GFX10-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, v7, v10, vcc_lo +; GFX10-NEXT: s_mov_b32 s4, exec_lo ; GFX10-NEXT: s_mov_b32 s5, -1 ; GFX10-NEXT: global_load_dword v11, v[11:12], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 -; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB4_1 +; GFX10-NEXT: s_and_b32 s6, vcc_lo, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB4_1 ; GFX10-NEXT: ; %bb.7: ; %loop.body ; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1 ; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v0, v9 @@ -337,6 +365,8 @@ define amdgpu_cs void @loop_with_3breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v11 ; GFX10-NEXT: global_store_dword v[9:10], v11, off +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_branch .LBB4_1 ; GFX10-NEXT: .LBB4_8: ; %exit ; GFX10-NEXT: s_endpgm @@ -390,31 +420,34 @@ define amdgpu_cs void @loop_with_div_break_with_body(ptr addrspace(1) %x, ptr ad ; GFX10-NEXT: s_branch .LBB5_2 ; GFX10-NEXT: .LBB5_1: ; %Flow ; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_and_b32 s4, exec_lo, s2 ; GFX10-NEXT: s_or_b32 s0, s4, s0 ; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo ; GFX10-NEXT: s_and_b32 s4, exec_lo, s3 ; GFX10-NEXT: s_or_b32 s1, s1, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX10-NEXT: s_cbranch_execz .LBB5_4 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s0 +; GFX10-NEXT: s_and_b32 s5, s4, -1 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s0 +; GFX10-NEXT: s_cbranch_scc0 .LBB5_4 ; GFX10-NEXT: .LBB5_2: ; %A ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GFX10-NEXT: s_andn2_b32 s3, s3, exec_lo -; GFX10-NEXT: s_and_b32 s4, exec_lo, -1 +; GFX10-NEXT: s_and_b32 s5, exec_lo, -1 ; GFX10-NEXT: s_andn2_b32 s2, s2, exec_lo -; GFX10-NEXT: s_or_b32 s3, s3, s4 +; GFX10-NEXT: s_mov_b32 s4, exec_lo ; GFX10-NEXT: v_lshlrev_b64 v[7:8], 2, v[6:7] -; GFX10-NEXT: s_or_b32 s2, s2, s4 +; GFX10-NEXT: s_or_b32 s3, s3, s5 +; GFX10-NEXT: s_or_b32 s2, s2, s5 ; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v2, v7 ; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v3, v8, vcc_lo ; GFX10-NEXT: global_load_dword v9, v[9:10], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9 -; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB5_1 +; GFX10-NEXT: s_and_b32 s5, vcc_lo, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB5_1 ; GFX10-NEXT: ; %bb.3: ; %loop.body ; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1 ; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v0, v7 @@ -432,12 +465,15 @@ define amdgpu_cs void @loop_with_div_break_with_body(ptr addrspace(1) %x, ptr ad ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v9 ; GFX10-NEXT: global_store_dword v[7:8], v9, off +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_branch .LBB5_1 ; GFX10-NEXT: .LBB5_4: ; %loop.exit.guard -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX10-NEXT: s_and_saveexec_b32 s0, s1 -; GFX10-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX10-NEXT: s_cbranch_execz .LBB5_6 +; GFX10-NEXT: s_and_b32 s0, s1, exec_lo +; GFX10-NEXT: s_xor_b32 s1, s0, exec_lo +; GFX10-NEXT: s_and_b32 s1, s0, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, s0 +; GFX10-NEXT: s_cbranch_scc0 .LBB5_6 ; GFX10-NEXT: ; %bb.5: ; %break.body ; GFX10-NEXT: v_mov_b32_e32 v0, 10 ; GFX10-NEXT: global_store_dword v[4:5], v0, off diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir index 1d291eeab8e9d..8d2861c6012c6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir @@ -31,6 +31,7 @@ body: | ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY2]](s32), [[C2]] ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1) + ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), [[SI_IF]](s32) ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY5]](s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY6]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc @@ -38,7 +39,6 @@ body: | ; GFX10-NEXT: bb.2: ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[COPY4]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.1 ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32) ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY7]](s1), [[C4]], [[C3]] @@ -65,10 +65,10 @@ body: | %10:_(s32) = G_CONSTANT i32 1 %11:_(s1) = G_ICMP intpred(ult), %3(s32), %10 + G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %9(s32) bb.2: %12:_(s1) = G_PHI %6(s1), %bb.0, %11(s1), %bb.1 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %9(s32) %13:_(s32) = G_CONSTANT i32 2 %14:_(s32) = G_CONSTANT i32 1 %15:_(s32) = G_SELECT %12(s1), %14, %13 @@ -115,6 +115,7 @@ body: | ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY2]](s32), [[C1]] ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP1]](s1) + ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), [[SI_ELSE]](s32) ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc @@ -134,7 +135,6 @@ body: | ; GFX10-NEXT: bb.4: ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[COPY7]](s1), %bb.1, [[S_OR_B32_]](s1), %bb.2 ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_ELSE]](s32) ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY11]](s1), [[C3]], [[C4]] @@ -167,6 +167,7 @@ body: | %12:_(s32) = G_CONSTANT i32 1 %13:_(s1) = G_ICMP intpred(uge), %3(s32), %12 + G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %11(s32) G_BR %bb.4 bb.3: @@ -178,7 +179,6 @@ body: | bb.4: %15:_(s1) = G_PHI %9(s1), %bb.1, %13(s1), %bb.2 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %11(s32) %16:_(s32) = G_CONSTANT i32 1 %17:_(s32) = G_CONSTANT i32 2 %18:_(s32) = G_SELECT %15(s1), %16, %17 @@ -243,6 +243,7 @@ body: | ; GFX10-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 100 ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI2]](s32), [[C6]] ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP1]](s1) + ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), [[SI_IF]](s32) ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY7]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc @@ -253,14 +254,12 @@ body: | ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_1]](s1), %bb.2 ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.2, [[DEF]](s32), %bb.1 ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32) ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY8]](s1), [[PHI1]](s32) ; GFX10-NEXT: SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.4 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.4: ; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.3 - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI5]](s32) ; GFX10-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1(0x80000000) @@ -304,20 +303,19 @@ body: | %27:_(s32) = G_ADD %10, %25 %28:_(s32) = G_CONSTANT i32 100 %29:_(s1) = G_ICMP intpred(ult), %10(s32), %28 + G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %20(s32) bb.3: successors: %bb.4(0x04000000), %bb.1(0x7c000000) %11:_(s32) = G_PHI %27(s32), %bb.2, %7(s32), %bb.1 %30:_(s1) = G_PHI %29(s1), %bb.2, %12(s1), %bb.1 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %20(s32) %9:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %30(s1), %8(s32) SI_LOOP %9(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec G_BR %bb.4 bb.4: %31:_(s32) = G_PHI %9(s32), %bb.3 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %31(s32) S_ENDPGM 0 ... @@ -347,7 +345,7 @@ body: | ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %48(s1), %bb.3 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %47(s1), %bb.3 ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %12(s32), %bb.3, [[C]](s32), %bb.0 ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %14(s32), %bb.3 ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1) @@ -385,10 +383,9 @@ body: | ; GFX10-NEXT: bb.3: ; GFX10-NEXT: successors: %bb.6(0x04000000), %bb.1(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, %47(s1), %bb.5 + ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, %46(s1), %bb.5 ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI %32(s32), %bb.5, [[DEF]](s32), %bb.1 ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32) ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY11]](s1), [[PHI1]](s32) ; GFX10-NEXT: SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.6 @@ -407,6 +404,7 @@ body: | ; GFX10-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 100 ; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI2]](s32), [[C9]] ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1) + ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), [[SI_IF1]](s32) ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY10]](s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY12]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc @@ -418,15 +416,13 @@ body: | ; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.4, [[DEF]](s32), %bb.2 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1) ; GFX10-NEXT: [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[COPY13]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32) + ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), [[SI_IF]](s32) ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY14]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc ; GFX10-NEXT: G_BR %bb.3 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.6: - ; GFX10-NEXT: [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.3 - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI7]](s32) ; GFX10-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1(0x80000000) @@ -478,7 +474,6 @@ body: | %14:_(s32) = G_PHI %32(s32), %bb.5, %10(s32), %bb.1 %33:_(s1) = G_PHI %34(s1), %bb.5, %15(s1), %bb.1 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %23(s32) %12:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %33(s1), %11(s32) SI_LOOP %12(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec G_BR %bb.6 @@ -496,18 +491,17 @@ body: | %41:_(s32) = G_ADD %13, %39 %42:_(s32) = G_CONSTANT i32 100 %43:_(s1) = G_ICMP intpred(ult), %13(s32), %42 + G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %31(s32) bb.5: successors: %bb.3(0x80000000) %32:_(s32) = G_PHI %41(s32), %bb.4, %10(s32), %bb.2 %34:_(s1) = G_PHI %43(s1), %bb.4, %24(s1), %bb.2 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %31(s32) + G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %23(s32) G_BR %bb.3 bb.6: - %44:_(s32) = G_PHI %12(s32), %bb.3 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %44(s32) S_ENDPGM 0 ... @@ -540,7 +534,7 @@ body: | ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %61(s1), %bb.3 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %60(s1), %bb.3 ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %15(s32), %bb.3, [[C]](s32), %bb.0 ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %17(s32), %bb.3 ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1) @@ -578,10 +572,9 @@ body: | ; GFX10-NEXT: bb.3: ; GFX10-NEXT: successors: %bb.8(0x04000000), %bb.1(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, %60(s1), %bb.5 + ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, %59(s1), %bb.5 ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI %35(s32), %bb.5, [[DEF]](s32), %bb.1 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32) ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY13]](s1), [[PHI1]](s32) ; GFX10-NEXT: SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.8 @@ -604,11 +597,11 @@ body: | ; GFX10-NEXT: bb.5: ; GFX10-NEXT: successors: %bb.3(0x80000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[COPY11]](s1), %bb.2, %72(s1), %bb.7 + ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[COPY11]](s1), %bb.2, %71(s1), %bb.7 ; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI %46(s32), %bb.7, [[DEF]](s32), %bb.2 ; GFX10-NEXT: [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1) ; GFX10-NEXT: [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY [[COPY16]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32) + ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), [[SI_IF]](s32) ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY10]](s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY17]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc @@ -628,6 +621,7 @@ body: | ; GFX10-NEXT: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 100 ; GFX10-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI2]](s32), [[C12]] ; GFX10-NEXT: [[COPY18:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP3]](s1) + ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), [[SI_IF2]](s32) ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY15]](s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY18]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc @@ -639,15 +633,13 @@ body: | ; GFX10-NEXT: [[PHI8:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.6, [[DEF]](s32), %bb.4 ; GFX10-NEXT: [[COPY19:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]](s1) ; GFX10-NEXT: [[COPY20:%[0-9]+]]:sreg_32(s1) = COPY [[COPY19]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF2]](s32) + ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), [[SI_IF1]](s32) ; GFX10-NEXT: [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY12]](s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY20]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_3:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_3]](s1), [[S_AND_B32_3]](s1), implicit-def $scc ; GFX10-NEXT: G_BR %bb.5 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.8: - ; GFX10-NEXT: [[PHI9:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.3 - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI9]](s32) ; GFX10-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1(0x80000000) @@ -702,7 +694,6 @@ body: | %17:_(s32) = G_PHI %35(s32), %bb.5, %13(s32), %bb.1 %36:_(s1) = G_PHI %37(s1), %bb.5, %18(s1), %bb.1 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %26(s32) %15:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %36(s1), %14(s32) SI_LOOP %15(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec G_BR %bb.8 @@ -725,7 +716,7 @@ body: | %35:_(s32) = G_PHI %46(s32), %bb.7, %13(s32), %bb.2 %37:_(s1) = G_PHI %47(s1), %bb.7, %27(s1), %bb.2 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %34(s32) + G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %26(s32) G_BR %bb.3 bb.6: @@ -741,18 +732,17 @@ body: | %54:_(s32) = G_ADD %16, %52 %55:_(s32) = G_CONSTANT i32 100 %56:_(s1) = G_ICMP intpred(ult), %16(s32), %55 + G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %45(s32) bb.7: successors: %bb.5(0x80000000) %46:_(s32) = G_PHI %54(s32), %bb.6, %13(s32), %bb.4 %47:_(s1) = G_PHI %56(s1), %bb.6, %38(s1), %bb.4 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %45(s32) + G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %34(s32) G_BR %bb.5 bb.8: - %57:_(s32) = G_PHI %15(s32), %bb.3 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %57(s32) S_ENDPGM 0 ... @@ -784,9 +774,9 @@ body: | ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[DEF3]](s1), %bb.0, %67(s1), %bb.5 - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF2]](s1), %bb.0, %56(s1), %bb.5 - ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %43(s1), %bb.5 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[DEF3]](s1), %bb.0, %66(s1), %bb.5 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF2]](s1), %bb.0, %55(s1), %bb.5 + ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %42(s1), %bb.5 ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI %12(s32), %bb.5, [[C]](s32), %bb.0 ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %14(s32), %bb.5 ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]](s1) @@ -818,6 +808,7 @@ body: | ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 ; GFX10-NEXT: G_STORE [[C4]](s32), [[MV2]](p1) :: (store (s32), addrspace 1) + ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %25(s32) ; GFX10-NEXT: G_BR %bb.4 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.3: @@ -836,6 +827,7 @@ body: | ; GFX10-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 100 ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI4]](s32), [[C8]] ; GFX10-NEXT: [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP1]](s1) + ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), [[SI_IF]](s32) ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY11]](s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY13]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc @@ -845,7 +837,6 @@ body: | ; GFX10-NEXT: G_BR %bb.5 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.4: - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %35(s32) ; GFX10-NEXT: S_ENDPGM 0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.5: @@ -857,7 +848,6 @@ body: | ; GFX10-NEXT: [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1) ; GFX10-NEXT: [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[PHI6]](s1) ; GFX10-NEXT: [[COPY17:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY16]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32) ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY15]](s1), [[PHI3]](s32) ; GFX10-NEXT: [[S_ANDN2_B32_4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY17]](s1), implicit-def $scc @@ -868,9 +858,7 @@ body: | ; GFX10-NEXT: bb.6: ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI8:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.5 ; GFX10-NEXT: [[COPY18:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[S_OR_B32_4]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI8]](s32) ; GFX10-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY18]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.2 bb.0: @@ -910,6 +898,7 @@ body: | %24:_(s32) = G_CONSTANT i32 10 G_STORE %24(s32), %8(p1) :: (store (s32), addrspace 1) + G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %35(s32) G_BR %bb.4 bb.3: @@ -926,10 +915,10 @@ body: | %32:_(s32) = G_ADD %13, %30 %33:_(s32) = G_CONSTANT i32 100 %34:_(s1) = G_ICMP intpred(ult), %13(s32), %33 + G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %23(s32) G_BR %bb.5 bb.4: - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %35(s32) S_ENDPGM 0 bb.5: @@ -938,7 +927,6 @@ body: | %14:_(s32) = G_PHI %32(s32), %bb.3, %10(s32), %bb.1 %36:_(s1) = G_PHI %25(s1), %bb.3, %15(s1), %bb.1 %37:_(s1) = G_PHI %34(s1), %bb.3, %15(s1), %bb.1 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %23(s32) %12:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %37(s1), %11(s32) SI_LOOP %12(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec G_BR %bb.6 @@ -947,8 +935,6 @@ body: | successors: %bb.2(0x40000000), %bb.4(0x40000000) %38:sreg_32_xm0_xexec(s1) = G_PHI %36(s1), %bb.5 - %39:_(s32) = G_PHI %12(s32), %bb.5 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %39(s32) %35:sreg_32_xm0_xexec(s32) = SI_IF %38(s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec G_BR %bb.2 ... @@ -984,19 +970,19 @@ body: | ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sle), [[COPY4]](s32), [[COPY]] + ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %11(s32) ; GFX10-NEXT: G_BR %bb.3 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.2: ; GFX10-NEXT: successors: %bb.4(0x40000000), %bb.7(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI %67(s1), %bb.6, %71(s1), %bb.7 - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI %49(s1), %bb.6, %48(s1), %bb.7 - ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI %35(s1), %bb.6, %34(s1), %bb.7 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI %65(s1), %bb.6, %69(s1), %bb.7 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI %47(s1), %bb.6, %46(s1), %bb.7 + ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI %33(s1), %bb.6, %32(s1), %bb.7 ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]](s1) ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1) ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]](s1) ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY10]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %15(s32) ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY9]](s1), %17(s32) ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY11]](s1), implicit-def $scc @@ -1016,7 +1002,6 @@ body: | ; GFX10-NEXT: bb.4: ; GFX10-NEXT: successors: %bb.5(0x04000000), %bb.7(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INT]](s32) ; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY5]](s32), [[COPY]] ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1) ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true @@ -1026,7 +1011,7 @@ body: | ; GFX10-NEXT: [[INT2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[OR]](s1), %25(s32) ; GFX10-NEXT: [[DEF4:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF ; GFX10-NEXT: [[DEF5:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF - ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 %63(s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 %61(s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY13]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY12]](s1), $exec_lo, implicit-def $scc @@ -1036,9 +1021,7 @@ body: | ; GFX10-NEXT: G_BR %bb.5 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.5: - ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INT2]](s32), %bb.4 ; GFX10-NEXT: [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_1]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32) ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY15]](s1), [[COPY3]], [[COPY2]] ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[SELECT]](s32) ; GFX10-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32) @@ -1047,15 +1030,13 @@ body: | ; GFX10-NEXT: bb.6: ; GFX10-NEXT: successors: %bb.2(0x80000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[INT1]](s32), %bb.3 ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s1) = G_CONSTANT i1 false ; GFX10-NEXT: [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1) ; GFX10-NEXT: [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI5]](s32) - ; GFX10-NEXT: [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 %42(s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 %40(s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY17]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_3:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_3]](s1), [[S_AND_B32_3]](s1), implicit-def $scc - ; GFX10-NEXT: [[S_ANDN2_B32_4:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 %56(s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_ANDN2_B32_4:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 %54(s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_4:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY16]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_4:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_4]](s1), [[S_AND_B32_4]](s1), implicit-def $scc ; GFX10-NEXT: [[DEF6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = IMPLICIT_DEF @@ -1064,16 +1045,16 @@ body: | ; GFX10-NEXT: bb.7: ; GFX10-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[COPY7]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.2, [[S_OR_B32_2]](s1), %bb.4 - ; GFX10-NEXT: [[PHI7:%[0-9]+]]:sreg_32(s1) = PHI [[DEF3]](s1), %bb.0, [[PHI7]](s1), %bb.2, [[S_OR_B32_1]](s1), %bb.4 - ; GFX10-NEXT: [[PHI8:%[0-9]+]]:sreg_32(s1) = PHI [[DEF2]](s1), %bb.0, [[PHI1]](s1), %bb.2, [[DEF5]](s1), %bb.4 - ; GFX10-NEXT: [[PHI9:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, [[PHI2]](s1), %bb.2, [[DEF4]](s1), %bb.4 - ; GFX10-NEXT: [[PHI10:%[0-9]+]]:_(s32) = G_PHI [[INT2]](s32), %bb.4, [[PHI10]](s32), %bb.2, [[C]](s32), %bb.0 - ; GFX10-NEXT: [[PHI11:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.4, [[INT]](s32), %bb.2, [[C]](s32), %bb.0 - ; GFX10-NEXT: [[COPY18:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI6]](s1) - ; GFX10-NEXT: [[COPY19:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]](s1) - ; GFX10-NEXT: [[COPY20:%[0-9]+]]:sreg_32(s1) = COPY [[PHI8]](s1) - ; GFX10-NEXT: [[COPY21:%[0-9]+]]:sreg_32(s1) = COPY [[PHI9]](s1) + ; GFX10-NEXT: [[PHI4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[COPY7]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.2, [[S_OR_B32_2]](s1), %bb.4 + ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[DEF3]](s1), %bb.0, [[PHI5]](s1), %bb.2, [[S_OR_B32_1]](s1), %bb.4 + ; GFX10-NEXT: [[PHI6:%[0-9]+]]:sreg_32(s1) = PHI [[DEF2]](s1), %bb.0, [[PHI1]](s1), %bb.2, [[DEF5]](s1), %bb.4 + ; GFX10-NEXT: [[PHI7:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, [[PHI2]](s1), %bb.2, [[DEF4]](s1), %bb.4 + ; GFX10-NEXT: [[PHI8:%[0-9]+]]:_(s32) = G_PHI [[INT2]](s32), %bb.4, [[PHI8]](s32), %bb.2, [[C]](s32), %bb.0 + ; GFX10-NEXT: [[PHI9:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.4, [[INT]](s32), %bb.2, [[C]](s32), %bb.0 + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI4]](s1) + ; GFX10-NEXT: [[COPY19:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1) + ; GFX10-NEXT: [[COPY20:%[0-9]+]]:sreg_32(s1) = COPY [[PHI6]](s1) + ; GFX10-NEXT: [[COPY21:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]](s1) ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s1) = G_CONSTANT i1 true ; GFX10-NEXT: [[COPY22:%[0-9]+]]:sreg_32(s1) = COPY [[C4]](s1) ; GFX10-NEXT: [[S_ANDN2_B32_5:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY21]](s1), $exec_lo, implicit-def $scc @@ -1106,6 +1087,7 @@ body: | %9:_(s32) = G_CONSTANT i32 0 %10:_(s1) = G_ICMP intpred(sle), %4(s32), %0 + G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %15(s32) G_BR %bb.3 bb.2: @@ -1113,7 +1095,6 @@ body: | %11:_(s1) = G_PHI %12(s1), %bb.6, %7(s1), %bb.7 %13:_(s1) = G_PHI %12(s1), %bb.6, %14(s1), %bb.7 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %15(s32) %16:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %13(s1), %17(s32) SI_LOOP %16(s32), %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec G_BR %bb.4 @@ -1129,7 +1110,6 @@ body: | bb.4: successors: %bb.5(0x04000000), %bb.7(0x7c000000) - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %16(s32) %20:_(s1) = G_ICMP intpred(sgt), %5(s32), %0 %21:_(s1) = G_CONSTANT i1 true %22:_(s1) = G_XOR %8, %21 @@ -1140,8 +1120,6 @@ body: | bb.5: %26:_(s1) = G_PHI %20(s1), %bb.4 - %27:_(s32) = G_PHI %24(s32), %bb.4 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %27(s32) %28:_(s32) = G_SELECT %26(s1), %3, %2 %29:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), %28(s32) $sgpr0 = COPY %29(s32) @@ -1150,9 +1128,7 @@ body: | bb.6: successors: %bb.2(0x80000000) - %30:_(s32) = G_PHI %19(s32), %bb.3 %12:_(s1) = G_CONSTANT i1 false - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %30(s32) G_BR %bb.2 bb.7: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll index 1855ede0483de..c1090df6fe09e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll @@ -21,10 +21,11 @@ define void @temporal_divergent_i1_phi(float %val, ptr %addr) { ; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo ; GFX10-NEXT: s_and_b32 s4, exec_lo, s4 ; GFX10-NEXT: s_or_b32 s6, s6, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB0_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_and_b32 s7, s4, -1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX10-NEXT: ; %bb.2: ; %exit -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s6 ; GFX10-NEXT: flat_store_dword v[1:2], v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -67,10 +68,11 @@ define void @temporal_divergent_i1_non_phi(float %val, ptr %addr) { ; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo ; GFX10-NEXT: s_and_b32 s4, exec_lo, s4 ; GFX10-NEXT: s_or_b32 s6, s6, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB1_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_and_b32 s7, s4, -1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX10-NEXT: ; %bb.2: ; %exit -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s6 ; GFX10-NEXT: flat_store_dword v[1:2], v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -129,8 +131,10 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, i32 %x.size, ptr ad ; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo ; GFX10-NEXT: s_and_b32 s5, exec_lo, s5 ; GFX10-NEXT: s_or_b32 s0, s0, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execz .LBB2_5 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_and_b32 s6, s5, -1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc0 .LBB2_5 ; GFX10-NEXT: .LBB2_3: ; %A ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 @@ -149,10 +153,11 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, i32 %x.size, ptr ad ; GFX10-NEXT: ; implicit-def: $vgpr5 ; GFX10-NEXT: s_branch .LBB2_2 ; GFX10-NEXT: .LBB2_5: ; %loop.exit.guard -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_and_saveexec_b32 s1, s0 -; GFX10-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX10-NEXT: s_cbranch_execz .LBB2_7 +; GFX10-NEXT: s_and_b32 s0, s0, exec_lo +; GFX10-NEXT: s_xor_b32 s1, s0, exec_lo +; GFX10-NEXT: s_and_b32 s1, s0, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, s0 +; GFX10-NEXT: s_cbranch_scc0 .LBB2_7 ; GFX10-NEXT: ; %bb.6: ; %break.body ; GFX10-NEXT: v_mov_b32_e32 v0, 10 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir index fb436623bed2d..fd0439ca17932 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir @@ -22,7 +22,7 @@ body: | ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %22(s1), %bb.1 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %21(s1), %bb.1 ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C1]](s32), %bb.0 ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %9(s32), %bb.1 ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C]](s1), %bb.0, %11(s1), %bb.1 @@ -42,9 +42,7 @@ body: | ; GFX10-NEXT: G_BR %bb.2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.2: - ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.1 ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32) ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY5]](s1), [[C5]], [[C4]] @@ -79,8 +77,6 @@ body: | bb.2: %16:_(s1) = G_PHI %10(s1), %bb.1 - %17:_(s32) = G_PHI %7(s32), %bb.1 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %17(s32) %18:_(s32) = G_FCONSTANT float 0.000000e+00 %19:_(s32) = G_FCONSTANT float 1.000000e+00 %20:_(s32) = G_SELECT %16(s1), %19, %18 @@ -109,7 +105,7 @@ body: | ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %22(s1), %bb.1 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %21(s1), %bb.1 ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C1]](s32), %bb.0 ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %9(s32), %bb.1 ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C]](s1), %bb.0, %11(s1), %bb.1 @@ -129,9 +125,7 @@ body: | ; GFX10-NEXT: G_BR %bb.2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.2: - ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.1 ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32) ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY5]](s1), [[C5]], [[C4]] @@ -166,8 +160,6 @@ body: | bb.2: %16:_(s1) = G_PHI %11(s1), %bb.1 - %17:_(s32) = G_PHI %7(s32), %bb.1 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %17(s32) %18:_(s32) = G_FCONSTANT float 0.000000e+00 %19:_(s32) = G_FCONSTANT float 1.000000e+00 %20:_(s32) = G_SELECT %16(s1), %19, %18 @@ -203,8 +195,8 @@ body: | ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.3(0x50000000), %bb.5(0x30000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[DEF2]](s1), %bb.0, %53(s1), %bb.5 - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %42(s1), %bb.5 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[DEF2]](s1), %bb.0, %52(s1), %bb.5 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %41(s1), %bb.5 ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI %13(s32), %bb.5, [[C]](s32), %bb.0 ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %15(s32), %bb.5 ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]](s1) @@ -230,6 +222,7 @@ body: | ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 ; GFX10-NEXT: G_STORE [[C4]](s32), [[MV2]](p1) :: (store (s32), addrspace 1) + ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %25(s32) ; GFX10-NEXT: G_BR %bb.4 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.3: @@ -252,7 +245,6 @@ body: | ; GFX10-NEXT: G_BR %bb.5 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.4: - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %34(s32) ; GFX10-NEXT: S_ENDPGM 0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.5: @@ -273,9 +265,7 @@ body: | ; GFX10-NEXT: bb.6: ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.5 ; GFX10-NEXT: [[COPY14:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[S_OR_B32_2]](s1) - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI7]](s32) ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY14]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.2 bb.0: @@ -316,6 +306,7 @@ body: | %24:_(s32) = G_CONSTANT i32 10 G_STORE %24(s32), %9(p1) :: (store (s32), addrspace 1) + G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), %34(s32) G_BR %bb.4 bb.3: @@ -334,7 +325,6 @@ body: | G_BR %bb.5 bb.4: - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %34(s32) S_ENDPGM 0 bb.5: @@ -351,8 +341,6 @@ body: | successors: %bb.2(0x40000000), %bb.4(0x40000000) %37:sreg_32_xm0_xexec(s1) = G_PHI %35(s1), %bb.5 - %38:_(s32) = G_PHI %13(s32), %bb.5 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %38(s32) %34:sreg_32_xm0_xexec(s32) = SI_IF %37(s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec G_BR %bb.2 ... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll index 1934958ea8f37..2616310318e17 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll @@ -14,10 +14,11 @@ define void @temporal_divergent_i32(float %val, ptr %addr) { ; GFX10-NEXT: v_cvt_f32_u32_e32 v4, v3 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v0 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB0_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_and_b32 s6, s5, -1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX10-NEXT: ; %bb.2: ; %exit -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: flat_store_dword v[1:2], v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.mir index d1b473f2f41d8..f52a584d1bace 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.mir @@ -34,7 +34,6 @@ body: | ; GFX10-NEXT: bb.2: ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.1 ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.1 - ; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI3]](s32) ; GFX10-NEXT: G_STORE [[PHI2]](s32), [[MV]](p0) :: (store (s32)) ; GFX10-NEXT: SI_RETURN bb.0: @@ -64,7 +63,6 @@ body: | bb.2: %13:_(s32) = G_PHI %9(s32), %bb.1 %14:_(s32) = G_PHI %7(s32), %bb.1 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %14(s32) G_STORE %13(s32), %3(p0) :: (store (s32)) SI_RETURN ... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll index 78d908455e019..2adff26b6f07c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll @@ -8,14 +8,16 @@ define i32 @divergent_if_swap_brtarget_order0(i32 %value) { ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; CHECK-NEXT: s_mov_b64 s[4:5], exec +; CHECK-NEXT: s_and_b64 s[6:7], vcc, -1 ; CHECK-NEXT: ; implicit-def: $vgpr0 -; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CHECK-NEXT: s_cbranch_execz .LBB0_2 +; CHECK-NEXT: s_cmov_b64 exec, vcc +; CHECK-NEXT: s_cbranch_scc0 .LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %if.true ; CHECK-NEXT: global_load_dword v0, v[0:1], off glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: .LBB0_2: ; %endif ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: .LBB0_2: ; %endif ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: %c = icmp ne i32 %value, 0 @@ -35,14 +37,16 @@ define i32 @divergent_if_swap_brtarget_order1(i32 %value) { ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; CHECK-NEXT: s_mov_b64 s[4:5], exec +; CHECK-NEXT: s_and_b64 s[6:7], vcc, -1 ; CHECK-NEXT: ; implicit-def: $vgpr0 -; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CHECK-NEXT: s_cbranch_execz .LBB1_2 +; CHECK-NEXT: s_cmov_b64 exec, vcc +; CHECK-NEXT: s_cbranch_scc0 .LBB1_2 ; CHECK-NEXT: ; %bb.1: ; %if.true ; CHECK-NEXT: global_load_dword v0, v[0:1], off glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: .LBB1_2: ; %endif ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: .LBB1_2: ; %endif ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: %c = icmp ne i32 %value, 0 @@ -64,14 +68,16 @@ define i32 @divergent_if_nonboolean_condition0(i32 %value) { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; CHECK-NEXT: s_mov_b64 s[4:5], exec +; CHECK-NEXT: s_and_b64 s[6:7], vcc, -1 ; CHECK-NEXT: ; implicit-def: $vgpr0 -; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CHECK-NEXT: s_cbranch_execz .LBB2_2 +; CHECK-NEXT: s_cmov_b64 exec, vcc +; CHECK-NEXT: s_cbranch_scc0 .LBB2_2 ; CHECK-NEXT: ; %bb.1: ; %if.true ; CHECK-NEXT: global_load_dword v0, v[0:1], off glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: .LBB2_2: ; %endif ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: .LBB2_2: ; %endif ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: %c = trunc i32 %value to i1 @@ -92,17 +98,19 @@ define i32 @divergent_if_nonboolean_condition1(ptr addrspace(1) %ptr) { ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: global_load_dword v0, v[0:1], off +; CHECK-NEXT: s_mov_b64 s[4:5], exec ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; CHECK-NEXT: s_and_b64 s[6:7], vcc, -1 ; CHECK-NEXT: ; implicit-def: $vgpr0 -; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CHECK-NEXT: s_cbranch_execz .LBB3_2 +; CHECK-NEXT: s_cmov_b64 exec, vcc +; CHECK-NEXT: s_cbranch_scc0 .LBB3_2 ; CHECK-NEXT: ; %bb.1: ; %if.true ; CHECK-NEXT: global_load_dword v0, v[0:1], off glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: .LBB3_2: ; %endif ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: .LBB3_2: ; %endif ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: %value = load i32, ptr addrspace(1) %ptr @@ -212,8 +220,10 @@ define amdgpu_kernel void @break_loop(i32 %arg) { ; CHECK-NEXT: ; in Loop: Header=BB5_3 Depth=1 ; CHECK-NEXT: s_and_b64 s[4:5], exec, s[2:3] ; CHECK-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] -; CHECK-NEXT: s_andn2_b64 exec, exec, s[0:1] -; CHECK-NEXT: s_cbranch_execz .LBB5_5 +; CHECK-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; CHECK-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; CHECK-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; CHECK-NEXT: s_cbranch_scc0 .LBB5_5 ; CHECK-NEXT: .LBB5_3: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_u32_e32 v1, 1, v1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll index 4e94a646f6da5..fb3bfb4c77a86 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll @@ -1095,8 +1095,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB39_3 +; GFX90A-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB39_3 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -1118,9 +1119,11 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX90A-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX90A-NEXT: s_cbranch_execnz .LBB39_2 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX90A-NEXT: s_cbranch_scc1 .LBB39_2 ; GFX90A-NEXT: .LBB39_3: ; GFX90A-NEXT: s_endpgm ; @@ -1131,8 +1134,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB39_2 +; GFX940-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX940-NEXT: s_cmov_b64 exec, vcc +; GFX940-NEXT: s_cbranch_scc0 .LBB39_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -1159,8 +1163,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB40_2 +; GFX90A-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB40_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -1181,8 +1186,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB40_2 +; GFX940-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX940-NEXT: s_cmov_b64 exec, vcc +; GFX940-NEXT: s_cbranch_scc0 .LBB40_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -1209,8 +1215,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB41_3 +; GFX90A-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB41_3 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -1232,9 +1239,11 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX90A-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX90A-NEXT: s_cbranch_execnz .LBB41_2 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX90A-NEXT: s_cbranch_scc1 .LBB41_2 ; GFX90A-NEXT: .LBB41_3: ; GFX90A-NEXT: s_endpgm ; @@ -1245,8 +1254,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB41_2 +; GFX940-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX940-NEXT: s_cmov_b64 exec, vcc +; GFX940-NEXT: s_cbranch_scc0 .LBB41_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -1273,8 +1283,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB42_2 +; GFX90A-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB42_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -1295,8 +1306,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB42_2 +; GFX940-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX940-NEXT: s_cmov_b64 exec, vcc +; GFX940-NEXT: s_cbranch_scc0 .LBB42_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -1352,10 +1364,11 @@ define double @global_atomic_fadd_f64_rtn_pat(ptr addrspace(1) %ptr, double %dat ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB44_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB44_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1417,10 +1430,11 @@ define double @global_atomic_fadd_f64_rtn_pat_system(ptr addrspace(1) %ptr, doub ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB46_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB46_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1485,8 +1499,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB49_3 +; GFX90A-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB49_3 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -1506,9 +1521,11 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX90A-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX90A-NEXT: s_cbranch_execnz .LBB49_2 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX90A-NEXT: s_cbranch_scc1 .LBB49_2 ; GFX90A-NEXT: .LBB49_3: ; GFX90A-NEXT: s_endpgm ; @@ -1519,8 +1536,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB49_2 +; GFX940-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX940-NEXT: s_cmov_b64 exec, vcc +; GFX940-NEXT: s_cbranch_scc0 .LBB49_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -1558,9 +1576,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX90A-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX90A-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX90A-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_endpgm ; @@ -1629,9 +1649,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX90A-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX90A-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX90A-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_endpgm ; @@ -1669,10 +1691,11 @@ define double @flat_atomic_fadd_f64_rtn_pat(ptr %ptr) #1 { ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1735,10 +1758,11 @@ define double @flat_atomic_fadd_f64_rtn_pat_system(ptr %ptr) #1 { ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1817,9 +1841,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX90A-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX90A-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX90A-NEXT: s_cbranch_execnz .LBB58_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX90A-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_endpgm ; @@ -1979,8 +2005,9 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB65_2 +; GFX90A-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB65_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3] @@ -2000,8 +2027,9 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB65_2 +; GFX940-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX940-NEXT: s_cmov_b64 exec, vcc +; GFX940-NEXT: s_cbranch_scc0 .LBB65_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3] @@ -2026,8 +2054,9 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3 ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB66_2 +; GFX90A-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB66_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3] @@ -2047,8 +2076,9 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3 ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB66_2 +; GFX940-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX940-NEXT: s_cmov_b64 exec, vcc +; GFX940-NEXT: s_cbranch_scc0 .LBB66_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3] @@ -2073,8 +2103,9 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB67_2 +; GFX90A-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB67_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3] @@ -2094,8 +2125,9 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB67_2 +; GFX940-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX940-NEXT: s_cmov_b64 exec, vcc +; GFX940-NEXT: s_cbranch_scc0 .LBB67_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll index 21832dc320e42..90563a4598a07 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll @@ -205,14 +205,14 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX90A_GFX940-NEXT: {{ $}} ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX90A_GFX940-NEXT: SI_WAVE_RECONVERGE [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A_GFX940-NEXT: {{ $}} ; GFX90A_GFX940-NEXT: bb.4.Flow: ; GFX90A_GFX940-NEXT: successors: %bb.5(0x80000000) ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A_GFX940-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A_GFX940-NEXT: {{ $}} ; GFX90A_GFX940-NEXT: bb.5 (%ir-block.37): - ; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A_GFX940-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic ret void diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll index e48d281f37c9a..5c845a56bf01c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll @@ -212,24 +212,24 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX11-NEXT: SI_WAVE_RECONVERGE [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX11-NEXT: S_BRANCH %bb.5 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: bb.4.Flow: ; GFX11-NEXT: successors: %bb.6(0x80000000) ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI %41, %bb.5, [[DEF]], %bb.1 - ; GFX11-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX11-NEXT: S_BRANCH %bb.6 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: bb.5 (%ir-block.39): ; GFX11-NEXT: successors: %bb.4(0x80000000) ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.3, [[DEF]], %bb.2 - ; GFX11-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec ; GFX11-NEXT: [[STRICT_WWM1:%[0-9]+]]:vgpr_32 = STRICT_WWM [[V_WRITELANE_B32_]], implicit $exec ; GFX11-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] ; GFX11-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY15]], 0, [[STRICT_WWM1]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX11-NEXT: S_BRANCH %bb.4 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: bb.6 (%ir-block.47): diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll index 10cbc56cc5fbe..4e2336f9c50fd 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll @@ -18,7 +18,7 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) { ; CHECK-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane ; CHECK-NEXT: v_mov_b32_e32 v8, v0 ; CHECK-NEXT: s_or_saveexec_b32 s21, -1 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b32 exec_lo, s21 ; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; CHECK-NEXT: v_mov_b32_e32 v15, v1 @@ -62,21 +62,19 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) { ; CHECK-NEXT: v_writelane_b32 v0, s5, 1 ; CHECK-NEXT: v_writelane_b32 v0, s6, 2 ; CHECK-NEXT: v_writelane_b32 v0, s7, 3 +; CHECK-NEXT: s_or_saveexec_b32 s21, -1 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b32 exec_lo, s21 ; CHECK-NEXT: s_mov_b32 s6, 0 ; CHECK-NEXT: s_mov_b32 s4, s6 ; CHECK-NEXT: s_mov_b32 s5, s6 -; CHECK-NEXT: v_mov_b32_e32 v1, s4 -; CHECK-NEXT: v_mov_b32_e32 v2, s5 -; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b32 s4, exec_lo -; CHECK-NEXT: v_writelane_b32 v0, s4, 4 -; CHECK-NEXT: s_or_saveexec_b32 s21, -1 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: v_mov_b32_e32 v1, s5 ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b32 exec_lo, s21 +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_or_saveexec_b32 s21, -1 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b32 exec_lo, s21 ; CHECK-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload @@ -111,14 +109,14 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) { ; CHECK-NEXT: s_mov_b32 s17, s6 ; CHECK-NEXT: s_mov_b32 s18, s5 ; CHECK-NEXT: s_mov_b32 s19, s4 -; CHECK-NEXT: v_writelane_b32 v0, s12, 5 -; CHECK-NEXT: v_writelane_b32 v0, s13, 6 -; CHECK-NEXT: v_writelane_b32 v0, s14, 7 -; CHECK-NEXT: v_writelane_b32 v0, s15, 8 -; CHECK-NEXT: v_writelane_b32 v0, s16, 9 -; CHECK-NEXT: v_writelane_b32 v0, s17, 10 -; CHECK-NEXT: v_writelane_b32 v0, s18, 11 -; CHECK-NEXT: v_writelane_b32 v0, s19, 12 +; CHECK-NEXT: v_writelane_b32 v0, s12, 4 +; CHECK-NEXT: v_writelane_b32 v0, s13, 5 +; CHECK-NEXT: v_writelane_b32 v0, s14, 6 +; CHECK-NEXT: v_writelane_b32 v0, s15, 7 +; CHECK-NEXT: v_writelane_b32 v0, s16, 8 +; CHECK-NEXT: v_writelane_b32 v0, s17, 9 +; CHECK-NEXT: v_writelane_b32 v0, s18, 10 +; CHECK-NEXT: v_writelane_b32 v0, s19, 11 ; CHECK-NEXT: v_mov_b32_e32 v7, v9 ; CHECK-NEXT: v_mov_b32_e32 v8, v10 ; CHECK-NEXT: v_mov_b32_e32 v5, v11 @@ -139,46 +137,42 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) { ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[6:7], v[1:2] ; CHECK-NEXT: s_and_b32 s4, s4, s5 ; CHECK-NEXT: s_and_saveexec_b32 s4, s4 -; CHECK-NEXT: v_writelane_b32 v0, s4, 13 +; CHECK-NEXT: v_writelane_b32 v0, s4, 12 ; CHECK-NEXT: s_or_saveexec_b32 s21, -1 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b32 exec_lo, s21 ; CHECK-NEXT: ; %bb.2: ; in Loop: Header=BB0_1 Depth=1 ; CHECK-NEXT: s_or_saveexec_b32 s21, -1 -; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b32 exec_lo, s21 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_readlane_b32 s4, v2, 13 -; CHECK-NEXT: v_readlane_b32 s8, v2, 5 -; CHECK-NEXT: v_readlane_b32 s9, v2, 6 -; CHECK-NEXT: v_readlane_b32 s10, v2, 7 -; CHECK-NEXT: v_readlane_b32 s11, v2, 8 -; CHECK-NEXT: v_readlane_b32 s12, v2, 9 -; CHECK-NEXT: v_readlane_b32 s13, v2, 10 -; CHECK-NEXT: v_readlane_b32 s14, v2, 11 -; CHECK-NEXT: v_readlane_b32 s15, v2, 12 +; CHECK-NEXT: v_readlane_b32 s5, v2, 12 +; CHECK-NEXT: v_readlane_b32 s8, v2, 4 +; CHECK-NEXT: v_readlane_b32 s9, v2, 5 +; CHECK-NEXT: v_readlane_b32 s10, v2, 6 +; CHECK-NEXT: v_readlane_b32 s11, v2, 7 +; CHECK-NEXT: v_readlane_b32 s12, v2, 8 +; CHECK-NEXT: v_readlane_b32 s13, v2, 9 +; CHECK-NEXT: v_readlane_b32 s14, v2, 10 +; CHECK-NEXT: v_readlane_b32 s15, v2, 11 ; CHECK-NEXT: v_readlane_b32 s16, v2, 0 ; CHECK-NEXT: v_readlane_b32 s17, v2, 1 ; CHECK-NEXT: v_readlane_b32 s18, v2, 2 ; CHECK-NEXT: v_readlane_b32 s19, v2, 3 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: image_sample v0, v[0:1], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; CHECK-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; CHECK-NEXT: s_cbranch_execnz .LBB0_1 +; CHECK-NEXT: s_xor_b32 s4, exec_lo, s5 +; CHECK-NEXT: s_and_b32 s6, s4, -1 +; CHECK-NEXT: s_cselect_b32 exec_lo, s4, s5 +; CHECK-NEXT: s_cbranch_scc1 .LBB0_1 ; CHECK-NEXT: ; %bb.3: -; CHECK-NEXT: s_or_saveexec_b32 s21, -1 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b32 exec_lo, s21 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_readlane_b32 s4, v0, 4 -; CHECK-NEXT: s_mov_b32 exec_lo, s4 ; CHECK-NEXT: ; %bb.4: ; CHECK-NEXT: s_or_saveexec_b32 s21, -1 -; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b32 exec_lo, s21 ; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; CHECK-NEXT: ; implicit-def: $sgpr4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-atomicrmw.ll index 8262cfd34823f..ef1d12e6ee278 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-atomicrmw.ll @@ -41,8 +41,6 @@ define float @test_atomicrmw_fsub(ptr addrspace(3) %addr) { ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3.atomicrmw.end: ; CHECK-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[ATOMIC_CMPXCHG_WITH_SUCCESS]](s32), %bb.2 - ; CHECK-NEXT: [[PHI3:%[0-9]+]]:_(s64) = G_PHI [[INTRINSIC]](s64), %bb.2 - ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI3]](s64) ; CHECK-NEXT: $vgpr0 = COPY [[PHI2]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %oldval = atomicrmw fsub ptr addrspace(3) %addr, float 1.0 seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll index 6d32d4c720c99..6f5e15a60a0dd 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll @@ -97,18 +97,18 @@ define void @i1_arg_i1_use(i1 %arg) #0 { ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[TRUNC]], [[C]] - ; CHECK-NEXT: [[INTRINSIC_W_SIDE_EFFECTS:%[0-9]+]]:_(s1), [[INTRINSIC_W_SIDE_EFFECTS1:%[0-9]+]]:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), [[XOR]](s1) - ; CHECK-NEXT: G_BRCOND [[INTRINSIC_W_SIDE_EFFECTS]](s1), %bb.2 + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s1), [[INT1:%[0-9]+]]:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), [[XOR]](s1) + ; CHECK-NEXT: G_BRCOND [[INT]](s1), %bb.2 ; CHECK-NEXT: G_BR %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2.bb1: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: G_STORE [[C1]](s32), [[DEF]](p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1) + ; CHECK-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wave.reconverge), [[INT1]](s64) ; CHECK-NEXT: G_BR %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3.bb2: - ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INTRINSIC_W_SIDE_EFFECTS1]](s64) ; CHECK-NEXT: SI_RETURN bb: br i1 %arg, label %bb2, label %bb1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/is-safe-to-sink-bug.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/is-safe-to-sink-bug.ll index d3bc661f5940b..cff03b95decad 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/is-safe-to-sink-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/is-safe-to-sink-bug.ll @@ -41,8 +41,7 @@ define amdgpu_ps void @_amdgpu_ps_main(i1 %arg) { ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB0_3: ; %bb6 ; CHECK-NEXT: ; in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; CHECK-NEXT: s_and_b32 s2, 1, s2 +; CHECK-NEXT: s_and_b32 s2, 1, s3 ; CHECK-NEXT: v_or_b32_e32 v1, 1, v0 ; CHECK-NEXT: v_cmp_ne_u32_e64 s2, 0, s2 ; CHECK-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v0 @@ -54,12 +53,16 @@ define amdgpu_ps void @_amdgpu_ps_main(i1 %arg) { ; CHECK-NEXT: s_cbranch_vccz .LBB0_1 ; CHECK-NEXT: .LBB0_4: ; %bb2 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_mov_b32 s2, 0 -; CHECK-NEXT: s_and_saveexec_b32 s3, s0 -; CHECK-NEXT: s_cbranch_execz .LBB0_3 +; CHECK-NEXT: s_and_b32 s4, s0, exec_lo +; CHECK-NEXT: s_mov_b32 s2, exec_lo +; CHECK-NEXT: s_mov_b32 s3, 0 +; CHECK-NEXT: s_and_b32 s5, s4, -1 +; CHECK-NEXT: s_cmov_b32 exec_lo, s4 +; CHECK-NEXT: s_cbranch_scc0 .LBB0_3 ; CHECK-NEXT: ; %bb.5: ; %bb5 ; CHECK-NEXT: ; in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: s_mov_b32 s2, 1 +; CHECK-NEXT: s_mov_b32 s3, 1 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; CHECK-NEXT: s_branch .LBB0_3 bb: %i = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> zeroinitializer, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll index d7b7f03d428bf..f15e923755461 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll @@ -10,14 +10,14 @@ define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) { ; GFX10-NEXT: s_cmp_lg_u32 s0, 0 ; GFX10-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX10-NEXT: ; %bb.1: ; %mid +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: global_store_dword v[0:1], v0, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: .LBB0_2: ; %bb -; GFX10-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-NEXT: .LBB0_2: ; %bb ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: global_store_dword v[0:1], v0, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -30,13 +30,13 @@ define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) { ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX11-NEXT: ; %bb.1: ; %mid +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: .LBB0_2: ; %bb -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: .LBB0_2: ; %bb ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -49,12 +49,12 @@ entry: mid: store volatile i32 0, ptr addrspace(1) undef + call void @llvm.amdgcn.wave.reconverge.i32(i32 %saved) br label %bb bb: - call void @llvm.amdgcn.end.cf.i32(i32 %saved) store volatile i32 0, ptr addrspace(1) undef ret void } -declare void @llvm.amdgcn.end.cf.i32(i32 %val) +declare void @llvm.amdgcn.wave.reconverge.i32(i32 %val) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll index 81d8472ebd46e..661ef413f76a3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll @@ -9,13 +9,12 @@ define amdgpu_kernel void @test_wave64(i32 %arg0, i64 %saved) { ; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB0_2 ; GCN-NEXT: ; %bb.1: ; %mid +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: global_store_dword v[0:1], v0, off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: .LBB0_2: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN-NEXT: .LBB0_2: ; %bb ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: global_store_dword v[0:1], v0, off ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -26,12 +25,12 @@ entry: mid: store volatile i32 0, ptr addrspace(1) undef + call void @llvm.amdgcn.wave.reconverge.i64(i64 %saved) br label %bb bb: - call void @llvm.amdgcn.end.cf.i64(i64 %saved) store volatile i32 0, ptr addrspace(1) undef ret void } -declare void @llvm.amdgcn.end.cf.i64(i64 %val) +declare void @llvm.amdgcn.wave.reconverge.i64(i64 %val) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll index 6e96a4ddbc0b3..c4b33790e0236 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll @@ -157,7 +157,6 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, ; GFX1030-NEXT: v_mov_b32_e32 v31, v10 ; GFX1030-NEXT: v_mov_b32_e32 v19, v11 ; GFX1030-NEXT: v_mov_b32_e32 v20, v12 -; GFX1030-NEXT: s_mov_b32 s1, exec_lo ; GFX1030-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 ; GFX1030-NEXT: v_readfirstlane_b32 s4, v19 ; GFX1030-NEXT: v_readfirstlane_b32 s5, v20 @@ -168,6 +167,7 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, ; GFX1030-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1030-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[21:31], s[4:7] +; GFX1030-NEXT: s_xor_b32 s1, exec_lo, s0 ; GFX1030-NEXT: ; implicit-def: $vgpr19 ; GFX1030-NEXT: ; implicit-def: $vgpr21 ; GFX1030-NEXT: ; implicit-def: $vgpr22 @@ -181,10 +181,10 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, ; GFX1030-NEXT: ; implicit-def: $vgpr30 ; GFX1030-NEXT: ; implicit-def: $vgpr31 ; GFX1030-NEXT: ; implicit-def: $vgpr11_vgpr12_vgpr13_vgpr14 -; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1030-NEXT: s_cbranch_execnz .LBB6_1 +; GFX1030-NEXT: s_and_b32 s2, s1, -1 +; GFX1030-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1030-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1030-NEXT: ; %bb.2: -; GFX1030-NEXT: s_mov_b32 exec_lo, s1 ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: ; return to shader part epilog ; @@ -192,7 +192,6 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, ; GFX1013: ; %bb.0: ; GFX1013-NEXT: v_mov_b32_e32 v19, v11 ; GFX1013-NEXT: v_mov_b32_e32 v20, v12 -; GFX1013-NEXT: s_mov_b32 s1, exec_lo ; GFX1013-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 ; GFX1013-NEXT: v_readfirstlane_b32 s4, v19 ; GFX1013-NEXT: v_readfirstlane_b32 s5, v20 @@ -203,14 +202,15 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, ; GFX1013-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1013-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1013-NEXT: image_bvh_intersect_ray v[15:18], v[0:10], s[4:7] +; GFX1013-NEXT: s_xor_b32 s1, exec_lo, s0 ; GFX1013-NEXT: ; implicit-def: $vgpr19 ; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 ; GFX1013-NEXT: ; implicit-def: $vgpr11_vgpr12_vgpr13_vgpr14 +; GFX1013-NEXT: s_and_b32 s2, s1, -1 ; GFX1013-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1013-NEXT: s_cbranch_execnz .LBB6_1 +; GFX1013-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1013-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1013-NEXT: ; %bb.2: -; GFX1013-NEXT: s_mov_b32 exec_lo, s1 ; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: v_mov_b32_e32 v0, v15 ; GFX1013-NEXT: v_mov_b32_e32 v1, v16 @@ -224,7 +224,6 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, ; GFX11-NEXT: v_dual_mov_b32 v15, v2 :: v_dual_mov_b32 v16, v3 ; GFX11-NEXT: v_dual_mov_b32 v17, v4 :: v_dual_mov_b32 v18, v11 ; GFX11-NEXT: v_mov_b32_e32 v19, v12 -; GFX11-NEXT: s_mov_b32 s1, exec_lo ; GFX11-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_readfirstlane_b32 s4, v18 @@ -238,6 +237,7 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v20, v21, v[15:17], v[5:7], v[8:10]], s[4:7] +; GFX11-NEXT: s_xor_b32 s1, exec_lo, s0 ; GFX11-NEXT: ; implicit-def: $vgpr18 ; GFX11-NEXT: ; implicit-def: $vgpr20 ; GFX11-NEXT: ; implicit-def: $vgpr21 @@ -245,10 +245,11 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, ; GFX11-NEXT: ; implicit-def: $vgpr5_vgpr6_vgpr7 ; GFX11-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10 ; GFX11-NEXT: ; implicit-def: $vgpr11_vgpr12_vgpr13_vgpr14 -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB6_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s2, s1, -1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr) @@ -259,46 +260,46 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) { ; GFX1030-LABEL: image_bvh_intersect_ray_a16_vgpr_descr: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: v_mov_b32_e32 v18, v0 -; GFX1030-NEXT: v_mov_b32_e32 v19, v1 +; GFX1030-NEXT: v_mov_b32_e32 v16, v0 +; GFX1030-NEXT: v_mov_b32_e32 v17, v1 ; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v5 ; GFX1030-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; GFX1030-NEXT: v_mov_b32_e32 v20, v2 +; GFX1030-NEXT: v_mov_b32_e32 v18, v2 ; GFX1030-NEXT: v_and_b32_e32 v2, 0xffff, v8 -; GFX1030-NEXT: v_mov_b32_e32 v21, v3 +; GFX1030-NEXT: v_mov_b32_e32 v19, v3 ; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX1030-NEXT: v_mov_b32_e32 v22, v4 -; GFX1030-NEXT: v_mov_b32_e32 v16, v9 -; GFX1030-NEXT: v_mov_b32_e32 v17, v10 -; GFX1030-NEXT: v_and_or_b32 v23, 0xffff, v5, v0 -; GFX1030-NEXT: v_and_or_b32 v24, 0xffff, v6, v1 -; GFX1030-NEXT: v_alignbit_b32 v25, v2, v7, 16 -; GFX1030-NEXT: s_mov_b32 s1, exec_lo +; GFX1030-NEXT: v_mov_b32_e32 v20, v4 +; GFX1030-NEXT: v_mov_b32_e32 v14, v9 +; GFX1030-NEXT: v_mov_b32_e32 v15, v10 +; GFX1030-NEXT: v_and_or_b32 v21, 0xffff, v5, v0 +; GFX1030-NEXT: v_and_or_b32 v22, 0xffff, v6, v1 +; GFX1030-NEXT: v_alignbit_b32 v23, v2, v7, 16 ; GFX1030-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX1030-NEXT: v_readfirstlane_b32 s4, v16 -; GFX1030-NEXT: v_readfirstlane_b32 s5, v17 +; GFX1030-NEXT: v_readfirstlane_b32 s4, v14 +; GFX1030-NEXT: v_readfirstlane_b32 s5, v15 ; GFX1030-NEXT: v_readfirstlane_b32 s6, v11 ; GFX1030-NEXT: v_readfirstlane_b32 s7, v12 -; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[16:17] +; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[14:15] ; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[11:12] ; GFX1030-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1030-NEXT: s_and_saveexec_b32 s0, s0 -; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[18:25], s[4:7] a16 +; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[16:23], s[4:7] a16 +; GFX1030-NEXT: s_xor_b32 s1, exec_lo, s0 +; GFX1030-NEXT: ; implicit-def: $vgpr14 ; GFX1030-NEXT: ; implicit-def: $vgpr16 +; GFX1030-NEXT: ; implicit-def: $vgpr17 ; GFX1030-NEXT: ; implicit-def: $vgpr18 ; GFX1030-NEXT: ; implicit-def: $vgpr19 ; GFX1030-NEXT: ; implicit-def: $vgpr20 ; GFX1030-NEXT: ; implicit-def: $vgpr21 ; GFX1030-NEXT: ; implicit-def: $vgpr22 ; GFX1030-NEXT: ; implicit-def: $vgpr23 -; GFX1030-NEXT: ; implicit-def: $vgpr24 -; GFX1030-NEXT: ; implicit-def: $vgpr25 ; GFX1030-NEXT: ; implicit-def: $vgpr9_vgpr10_vgpr11_vgpr12 -; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1030-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1030-NEXT: s_and_b32 s2, s1, -1 +; GFX1030-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1030-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1030-NEXT: ; %bb.2: -; GFX1030-NEXT: s_mov_b32 exec_lo, s1 ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: ; return to shader part epilog ; @@ -309,7 +310,6 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p ; GFX1013-NEXT: v_lshrrev_b32_e32 v9, 16, v5 ; GFX1013-NEXT: v_and_b32_e32 v10, 0xffff, v7 ; GFX1013-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX1013-NEXT: s_mov_b32 s1, exec_lo ; GFX1013-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; GFX1013-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; GFX1013-NEXT: v_alignbit_b32 v7, v8, v7, 16 @@ -325,14 +325,15 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p ; GFX1013-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1013-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1013-NEXT: image_bvh_intersect_ray v[13:16], v[0:7], s[4:7] a16 +; GFX1013-NEXT: s_xor_b32 s1, exec_lo, s0 ; GFX1013-NEXT: ; implicit-def: $vgpr17 ; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1013-NEXT: ; implicit-def: $vgpr9_vgpr10_vgpr11_vgpr12 +; GFX1013-NEXT: s_and_b32 s2, s1, -1 ; GFX1013-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1013-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1013-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1013-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1013-NEXT: ; %bb.2: -; GFX1013-NEXT: s_mov_b32 exec_lo, s1 ; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: v_mov_b32_e32 v0, v13 ; GFX1013-NEXT: v_mov_b32_e32 v1, v14 @@ -351,29 +352,30 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p ; GFX11-NEXT: v_lshl_or_b32 v4, v5, 16, v0 ; GFX11-NEXT: v_perm_b32 v5, v5, v7, 0x7060302 ; GFX11-NEXT: v_lshl_or_b32 v6, v6, 16, v1 -; GFX11-NEXT: s_mov_b32 s1, exec_lo ; GFX11-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_readfirstlane_b32 s4, v18 ; GFX11-NEXT: v_readfirstlane_b32 s5, v19 ; GFX11-NEXT: v_readfirstlane_b32 s6, v11 ; GFX11-NEXT: v_readfirstlane_b32 s7, v12 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[18:19] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[11:12] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v16, v17, v[13:15], v[4:6]], s[4:7] a16 +; GFX11-NEXT: s_xor_b32 s1, exec_lo, s0 ; GFX11-NEXT: ; implicit-def: $vgpr18 ; GFX11-NEXT: ; implicit-def: $vgpr16 ; GFX11-NEXT: ; implicit-def: $vgpr17 ; GFX11-NEXT: ; implicit-def: $vgpr13_vgpr14_vgpr15 ; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX11-NEXT: ; implicit-def: $vgpr9_vgpr10_vgpr11_vgpr12 -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB7_1 +; GFX11-NEXT: s_and_b32 s2, s1, -1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) @@ -398,7 +400,6 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr ; GFX1030-NEXT: v_mov_b32_e32 v33, v11 ; GFX1030-NEXT: v_mov_b32_e32 v20, v12 ; GFX1030-NEXT: v_mov_b32_e32 v21, v13 -; GFX1030-NEXT: s_mov_b32 s1, exec_lo ; GFX1030-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GFX1030-NEXT: v_readfirstlane_b32 s4, v20 ; GFX1030-NEXT: v_readfirstlane_b32 s5, v21 @@ -409,6 +410,7 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr ; GFX1030-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1030-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[22:33], s[4:7] +; GFX1030-NEXT: s_xor_b32 s1, exec_lo, s0 ; GFX1030-NEXT: ; implicit-def: $vgpr20 ; GFX1030-NEXT: ; implicit-def: $vgpr22 ; GFX1030-NEXT: ; implicit-def: $vgpr23 @@ -423,10 +425,10 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr ; GFX1030-NEXT: ; implicit-def: $vgpr32 ; GFX1030-NEXT: ; implicit-def: $vgpr33 ; GFX1030-NEXT: ; implicit-def: $vgpr12_vgpr13_vgpr14_vgpr15 -; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1030-NEXT: s_cbranch_execnz .LBB8_1 +; GFX1030-NEXT: s_and_b32 s2, s1, -1 +; GFX1030-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1030-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1030-NEXT: ; %bb.2: -; GFX1030-NEXT: s_mov_b32 exec_lo, s1 ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: ; return to shader part epilog ; @@ -434,7 +436,6 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr ; GFX1013: ; %bb.0: ; GFX1013-NEXT: v_mov_b32_e32 v20, v12 ; GFX1013-NEXT: v_mov_b32_e32 v21, v13 -; GFX1013-NEXT: s_mov_b32 s1, exec_lo ; GFX1013-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GFX1013-NEXT: v_readfirstlane_b32 s4, v20 ; GFX1013-NEXT: v_readfirstlane_b32 s5, v21 @@ -445,14 +446,15 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr ; GFX1013-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1013-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1013-NEXT: image_bvh64_intersect_ray v[16:19], v[0:11], s[4:7] +; GFX1013-NEXT: s_xor_b32 s1, exec_lo, s0 ; GFX1013-NEXT: ; implicit-def: $vgpr20 ; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 ; GFX1013-NEXT: ; implicit-def: $vgpr12_vgpr13_vgpr14_vgpr15 +; GFX1013-NEXT: s_and_b32 s2, s1, -1 ; GFX1013-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1013-NEXT: s_cbranch_execnz .LBB8_1 +; GFX1013-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1013-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1013-NEXT: ; %bb.2: -; GFX1013-NEXT: s_mov_b32 exec_lo, s1 ; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: v_mov_b32_e32 v0, v16 ; GFX1013-NEXT: v_mov_b32_e32 v1, v17 @@ -466,7 +468,6 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr ; GFX11-NEXT: v_dual_mov_b32 v21, v2 :: v_dual_mov_b32 v16, v3 ; GFX11-NEXT: v_dual_mov_b32 v17, v4 :: v_dual_mov_b32 v18, v5 ; GFX11-NEXT: v_dual_mov_b32 v4, v12 :: v_dual_mov_b32 v5, v13 -; GFX11-NEXT: s_mov_b32 s1, exec_lo ; GFX11-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_readfirstlane_b32 s4, v4 @@ -480,6 +481,7 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[19:20], v21, v[16:18], v[6:8], v[9:11]], s[4:7] +; GFX11-NEXT: s_xor_b32 s1, exec_lo, s0 ; GFX11-NEXT: ; implicit-def: $vgpr4 ; GFX11-NEXT: ; implicit-def: $vgpr19_vgpr20 ; GFX11-NEXT: ; implicit-def: $vgpr21 @@ -487,10 +489,11 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr ; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8 ; GFX11-NEXT: ; implicit-def: $vgpr9_vgpr10_vgpr11 ; GFX11-NEXT: ; implicit-def: $vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s2, s1, -1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr) @@ -517,7 +520,6 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node ; GFX1030-NEXT: v_and_or_b32 v25, 0xffff, v6, v0 ; GFX1030-NEXT: v_and_or_b32 v26, 0xffff, v7, v1 ; GFX1030-NEXT: v_alignbit_b32 v27, v2, v8, 16 -; GFX1030-NEXT: s_mov_b32 s1, exec_lo ; GFX1030-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GFX1030-NEXT: v_readfirstlane_b32 s4, v17 ; GFX1030-NEXT: v_readfirstlane_b32 s5, v18 @@ -528,6 +530,7 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node ; GFX1030-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1030-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[19:27], s[4:7] a16 +; GFX1030-NEXT: s_xor_b32 s1, exec_lo, s0 ; GFX1030-NEXT: ; implicit-def: $vgpr17 ; GFX1030-NEXT: ; implicit-def: $vgpr19 ; GFX1030-NEXT: ; implicit-def: $vgpr20 @@ -539,10 +542,10 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node ; GFX1030-NEXT: ; implicit-def: $vgpr26 ; GFX1030-NEXT: ; implicit-def: $vgpr27 ; GFX1030-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13 -; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1030-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1030-NEXT: s_and_b32 s2, s1, -1 +; GFX1030-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1030-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1030-NEXT: ; %bb.2: -; GFX1030-NEXT: s_mov_b32 exec_lo, s1 ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: ; return to shader part epilog ; @@ -553,7 +556,6 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node ; GFX1013-NEXT: v_lshrrev_b32_e32 v10, 16, v6 ; GFX1013-NEXT: v_and_b32_e32 v11, 0xffff, v8 ; GFX1013-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX1013-NEXT: s_mov_b32 s1, exec_lo ; GFX1013-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; GFX1013-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; GFX1013-NEXT: v_alignbit_b32 v8, v9, v8, 16 @@ -569,14 +571,15 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node ; GFX1013-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1013-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1013-NEXT: image_bvh64_intersect_ray v[14:17], v[0:8], s[4:7] a16 +; GFX1013-NEXT: s_xor_b32 s1, exec_lo, s0 ; GFX1013-NEXT: ; implicit-def: $vgpr18 ; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 ; GFX1013-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13 +; GFX1013-NEXT: s_and_b32 s2, s1, -1 ; GFX1013-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1013-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1013-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1013-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1013-NEXT: ; %bb.2: -; GFX1013-NEXT: s_mov_b32 exec_lo, s1 ; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: v_mov_b32_e32 v0, v14 ; GFX1013-NEXT: v_mov_b32_e32 v1, v15 @@ -595,29 +598,30 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node ; GFX11-NEXT: v_lshl_or_b32 v20, v6, 16, v0 ; GFX11-NEXT: v_perm_b32 v21, v6, v8, 0x7060302 ; GFX11-NEXT: v_lshl_or_b32 v22, v7, 16, v1 -; GFX11-NEXT: s_mov_b32 s1, exec_lo ; GFX11-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_readfirstlane_b32 s4, v4 ; GFX11-NEXT: v_readfirstlane_b32 s5, v5 ; GFX11-NEXT: v_readfirstlane_b32 s6, v12 ; GFX11-NEXT: v_readfirstlane_b32 s7, v13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[17:18], v19, v[14:16], v[20:22]], s[4:7] a16 +; GFX11-NEXT: s_xor_b32 s1, exec_lo, s0 ; GFX11-NEXT: ; implicit-def: $vgpr4 ; GFX11-NEXT: ; implicit-def: $vgpr17_vgpr18 ; GFX11-NEXT: ; implicit-def: $vgpr19 ; GFX11-NEXT: ; implicit-def: $vgpr14_vgpr15_vgpr16 ; GFX11-NEXT: ; implicit-def: $vgpr20_vgpr21_vgpr22 ; GFX11-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13 -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-NEXT: s_and_b32 s2, s1, -1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.make.buffer.rsrc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.make.buffer.rsrc.ll index 4a151aeca87e4..84f99ee572af4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.make.buffer.rsrc.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.make.buffer.rsrc.ll @@ -187,7 +187,6 @@ define amdgpu_ps float @general_case_load_with_waterfall(ptr %p, i16 %stride, i3 ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[V_AND_OR_B32_e64_]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -210,14 +209,12 @@ define amdgpu_ps float @general_case_load_with_waterfall(ptr %p, i16 %stride, i3 ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[COPY7]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll index 570a39d0fa5fb..ef817663ebaab 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll @@ -179,7 +179,6 @@ define amdgpu_ps float @raw_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_vof ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -205,14 +204,12 @@ define amdgpu_ps float @raw_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_vof ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_ATOMIC_ADD_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_OFFEN_RTN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_OFFEN_RTN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -232,7 +229,6 @@ define amdgpu_ps float @raw_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_vof ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -258,14 +254,12 @@ define amdgpu_ps float @raw_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_vof ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[BUFFER_ATOMIC_ADD_VBUFFER_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_VBUFFER_OFFEN_RTN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_VBUFFER_OFFEN_RTN]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -291,7 +285,6 @@ define amdgpu_ps void @raw_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__sgp ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -317,14 +310,12 @@ define amdgpu_ps void @raw_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__sgp ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: BUFFER_ATOMIC_ADD_OFFEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: S_ENDPGM 0 ; @@ -343,7 +334,6 @@ define amdgpu_ps void @raw_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__sgp ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -369,14 +359,12 @@ define amdgpu_ps void @raw_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__sgp ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: BUFFER_ATOMIC_ADD_VBUFFER_OFFEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll index c2799e5836a97..c87ad2999c113 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll @@ -105,7 +105,6 @@ define amdgpu_ps float @raw_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vgpr_ ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX8-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] ; GFX8-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -133,14 +132,12 @@ define amdgpu_ps float @raw_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vgpr_ ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1 ; GFX8-NEXT: [[BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN [[REG_SEQUENCE2]], [[COPY10]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX8-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN]].sub0 - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: $vgpr0 = COPY [[COPY15]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -162,7 +159,6 @@ define amdgpu_ps float @raw_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vgpr_ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -190,14 +186,12 @@ define amdgpu_ps float @raw_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vgpr_ ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1 ; GFX12-NEXT: [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN [[REG_SEQUENCE2]], [[COPY10]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX12-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN]].sub0 - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[COPY15]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -225,7 +219,6 @@ define amdgpu_ps void @raw_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cmp__ ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX8-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] ; GFX8-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -252,14 +245,12 @@ define amdgpu_ps void @raw_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cmp__ ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1 ; GFX8-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE2]], [[COPY10]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: S_ENDPGM 0 ; @@ -280,7 +271,6 @@ define amdgpu_ps void @raw_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cmp__ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -307,14 +297,12 @@ define amdgpu_ps void @raw_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cmp__ ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1 ; GFX12-NEXT: BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN [[REG_SEQUENCE2]], [[COPY10]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -497,7 +485,6 @@ define amdgpu_ps double @raw_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__vgpr ; GFX8-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] ; GFX8-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] ; GFX8-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -525,14 +512,12 @@ define amdgpu_ps double @raw_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__vgpr ; GFX8-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0_sub1, [[COPY11]], %subreg.sub2_sub3 ; GFX8-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN [[REG_SEQUENCE4]], [[COPY12]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) ; GFX8-NEXT: [[COPY17:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN]].sub0_sub1 - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY17]].sub0 ; GFX8-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY17]].sub1 @@ -563,7 +548,6 @@ define amdgpu_ps double @raw_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__vgpr ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -591,14 +575,12 @@ define amdgpu_ps double @raw_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__vgpr ; GFX12-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0_sub1, [[COPY11]], %subreg.sub2_sub3 ; GFX12-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN [[REG_SEQUENCE4]], [[COPY12]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) ; GFX12-NEXT: [[COPY17:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN]].sub0_sub1 - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY17]].sub0 ; GFX12-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY17]].sub1 @@ -635,7 +617,6 @@ define amdgpu_ps void @raw_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cmp__ ; GFX8-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] ; GFX8-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] ; GFX8-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -662,14 +643,12 @@ define amdgpu_ps void @raw_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cmp__ ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0_sub1, [[COPY11]], %subreg.sub2_sub3 ; GFX8-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_OFFEN [[REG_SEQUENCE4]], [[COPY12]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: S_ENDPGM 0 ; @@ -694,7 +673,6 @@ define amdgpu_ps void @raw_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cmp__ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -721,14 +699,12 @@ define amdgpu_ps void @raw_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cmp__ ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0_sub1, [[COPY11]], %subreg.sub2_sub3 ; GFX12-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN [[REG_SEQUENCE4]], [[COPY12]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 %ret = call i64 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll index c96fc017ae936..bdfbe35762d25 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll @@ -154,7 +154,6 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgp ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.3(0x80000000) @@ -180,14 +179,12 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgp ; GFX908-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.4: ; GFX908-NEXT: successors: %bb.5(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.5: ; GFX908-NEXT: S_ENDPGM 0 ; @@ -206,7 +203,6 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgp ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX90A-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX90A-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2: ; GFX90A-NEXT: successors: %bb.3(0x80000000) @@ -232,14 +228,12 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgp ; GFX90A-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX90A-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX90A-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX90A-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX90A-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.4: ; GFX90A-NEXT: successors: %bb.5(0x80000000) ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.5: ; GFX90A-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -261,7 +255,6 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_v ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY]] - ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.3(0x80000000) @@ -287,14 +280,12 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_v ; GFX908-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.4: ; GFX908-NEXT: successors: %bb.5(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.5: ; GFX908-NEXT: S_ENDPGM 0 ; @@ -311,7 +302,6 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_v ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY]] - ; GFX90A-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2: ; GFX90A-NEXT: successors: %bb.3(0x80000000) @@ -337,14 +327,12 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_v ; GFX90A-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX90A-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX90A-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX90A-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX90A-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.4: ; GFX90A-NEXT: successors: %bb.5(0x80000000) ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.5: ; GFX90A-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll index 36d5e914d40be..8ef57a0546514 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll @@ -210,7 +210,6 @@ define amdgpu_ps half @raw_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__vgpr ; PACKED-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -236,14 +235,12 @@ define amdgpu_ps half @raw_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__vgpr ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; PACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -261,7 +258,6 @@ define amdgpu_ps half @raw_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__vgpr ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -287,14 +283,12 @@ define amdgpu_ps half @raw_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__vgpr ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN]] ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -312,7 +306,6 @@ define amdgpu_ps half @raw_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__vgpr ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -338,14 +331,12 @@ define amdgpu_ps half @raw_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__vgpr ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[BUFFER_LOAD_FORMAT_D16_X_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_VBUFFER_OFFEN [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_VBUFFER_OFFEN]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll index 23efaa4d2bd91..1a0907848ddda 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll @@ -184,7 +184,6 @@ define amdgpu_ps float @raw_buffer_load_format_f32__vgpr_rsrc__sgpr_voffset__vgp ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -210,14 +209,12 @@ define amdgpu_ps float @raw_buffer_load_format_f32__vgpr_rsrc__sgpr_voffset__vgp ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_OFFEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -235,7 +232,6 @@ define amdgpu_ps float @raw_buffer_load_format_f32__vgpr_rsrc__sgpr_voffset__vgp ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -261,14 +257,12 @@ define amdgpu_ps float @raw_buffer_load_format_f32__vgpr_rsrc__sgpr_voffset__vgp ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[BUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll index 102a9bd840b09..fa023caea1243 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll @@ -89,7 +89,6 @@ define amdgpu_ps float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffs ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -112,14 +111,12 @@ define amdgpu_ps float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffs ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -136,7 +133,6 @@ define amdgpu_ps float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffs ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -159,14 +155,12 @@ define amdgpu_ps float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffs ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -188,7 +182,6 @@ define amdgpu_ps float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -214,14 +207,12 @@ define amdgpu_ps float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -238,7 +229,6 @@ define amdgpu_ps float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -264,14 +254,12 @@ define amdgpu_ps float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -894,7 +882,6 @@ define amdgpu_ps half @raw_buffer_load_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffse ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -917,14 +904,12 @@ define amdgpu_ps half @raw_buffer_load_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffse ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_OFFEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -941,7 +926,6 @@ define amdgpu_ps half @raw_buffer_load_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffse ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -964,14 +948,12 @@ define amdgpu_ps half @raw_buffer_load_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffse ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[BUFFER_LOAD_USHORT_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_VBUFFER_OFFEN]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -993,7 +975,6 @@ define amdgpu_ps float @raw_buffer_load_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffse ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -1016,14 +997,12 @@ define amdgpu_ps float @raw_buffer_load_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffse ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -1040,7 +1019,6 @@ define amdgpu_ps float @raw_buffer_load_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffse ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -1063,14 +1041,12 @@ define amdgpu_ps float @raw_buffer_load_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffse ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[BUFFER_LOAD_UBYTE_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_VBUFFER_OFFEN]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -1487,7 +1463,6 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs ; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000 ; GFX8-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def dead $scc - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -1510,14 +1485,12 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_ADD_I32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -1536,7 +1509,6 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000 ; GFX12-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def dead $scc - ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -1559,14 +1531,12 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_ADD_I32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_1]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -1592,7 +1562,6 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -1615,14 +1584,12 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE1]], [[COPY5]], 904, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -1639,7 +1606,6 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -1662,14 +1628,12 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 5000, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll index 6541085b72e54..19146371b9d5c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll @@ -233,7 +233,6 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY9]], [[COPY5]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3 - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -256,14 +255,12 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE2]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: S_ENDPGM 0 ; @@ -282,7 +279,6 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -305,14 +301,12 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE2]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 ; @@ -331,7 +325,6 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -354,14 +347,12 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_VBUFFER_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE2]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -668,7 +659,6 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY10]], [[COPY5]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3 - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -691,14 +681,12 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE2]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: S_ENDPGM 0 ; @@ -720,7 +708,6 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; PACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; PACKED-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -743,14 +730,12 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE2]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 ; @@ -769,7 +754,6 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -792,14 +776,12 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_VBUFFER_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE2]], [[COPY7]], 4096, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll index 1f89150f09ced..9ad85e563b269 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll @@ -207,7 +207,6 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr8 ; GFX8-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -230,14 +229,12 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE2]], [[COPY9]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: S_ENDPGM 0 ; @@ -258,7 +255,6 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr8 ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -281,14 +277,12 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: BUFFER_STORE_FORMAT_XYZW_VBUFFER_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE2]], [[COPY9]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -514,7 +508,6 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; GFX8-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY8]], [[COPY10]], 0, implicit $exec - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -537,14 +530,12 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE2]], [[COPY9]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: S_ENDPGM 0 ; @@ -565,7 +556,6 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr8 ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -588,14 +578,12 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: BUFFER_STORE_FORMAT_XYZW_VBUFFER_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE2]], [[COPY9]], 4096, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll index 030f8dae0ef79..2f43b4c4567ab 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll @@ -92,7 +92,6 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -115,14 +114,12 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: S_ENDPGM 0 ; @@ -139,7 +136,6 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -162,14 +158,12 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -191,7 +185,6 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -205,14 +198,12 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: S_ENDPGM 0 ; @@ -229,7 +220,6 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -243,14 +233,12 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -272,7 +260,6 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -298,14 +285,12 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: S_ENDPGM 0 ; @@ -322,7 +307,6 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -348,14 +332,12 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -912,7 +894,6 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 ; GFX8-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -935,14 +916,12 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE2]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: S_ENDPGM 0 ; @@ -961,7 +940,6 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -984,14 +962,12 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: BUFFER_STORE_DWORDX2_VBUFFER_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE2]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -1366,7 +1342,6 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -1389,14 +1364,12 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE1]], [[COPY6]], 904, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: S_ENDPGM 0 ; @@ -1413,7 +1386,6 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -1436,14 +1408,12 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 5000, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 %voffset.add = add i32 %voffset, 5000 @@ -1467,7 +1437,6 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__sgpr ; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -1490,14 +1459,12 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__sgpr ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY5]], 904, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: S_ENDPGM 0 ; @@ -1513,7 +1480,6 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__sgpr ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -1536,14 +1502,12 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__sgpr ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 5000, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 5000, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.add.ll index 93d68443c7843..67516e51fab8d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.add.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.add.ll @@ -108,7 +108,6 @@ define amdgpu_ps float @raw_ptr_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -134,14 +133,12 @@ define amdgpu_ps float @raw_ptr_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[BUFFER_ATOMIC_ADD_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_OFFEN_RTN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_OFFEN_RTN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -167,7 +164,6 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc_ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -193,14 +189,12 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc_ ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: BUFFER_ATOMIC_ADD_OFFEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 %ret = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add.i32(i32 %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.cmpswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.cmpswap.ll index 56b2d0452dd45..d868bf8148673 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.cmpswap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.cmpswap.ll @@ -67,7 +67,6 @@ define amdgpu_ps float @raw_ptr_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__v ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -95,14 +94,12 @@ define amdgpu_ps float @raw_ptr_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__v ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1 ; CHECK-NEXT: [[BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN [[REG_SEQUENCE2]], [[COPY10]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN]].sub0 - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[COPY15]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -130,7 +127,6 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_c ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -157,14 +153,12 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_c ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1 ; CHECK-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE2]], [[COPY10]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 %ret = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -279,7 +273,6 @@ define amdgpu_ps double @raw_ptr_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__ ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -307,14 +300,12 @@ define amdgpu_ps double @raw_ptr_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__ ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0_sub1, [[COPY11]], %subreg.sub2_sub3 ; CHECK-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN [[REG_SEQUENCE4]], [[COPY12]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: [[COPY17:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN]].sub0_sub1 - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY17]].sub0 ; CHECK-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY17]].sub1 @@ -351,7 +342,6 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_c ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -378,14 +368,12 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_c ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0_sub1, [[COPY11]], %subreg.sub2_sub3 ; CHECK-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_OFFEN [[REG_SEQUENCE4]], [[COPY12]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 %ret = call i64 @llvm.amdgcn.raw.ptr.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.ll index 999f42ff905ab..9b442c65204a7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.ll @@ -154,7 +154,6 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc_ ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.3(0x80000000) @@ -180,14 +179,12 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc_ ; GFX908-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.4: ; GFX908-NEXT: successors: %bb.5(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.5: ; GFX908-NEXT: S_ENDPGM 0 ; @@ -206,7 +203,6 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc_ ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX90A-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX90A-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2: ; GFX90A-NEXT: successors: %bb.3(0x80000000) @@ -232,14 +228,12 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc_ ; GFX90A-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX90A-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX90A-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX90A-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.4: ; GFX90A-NEXT: successors: %bb.5(0x80000000) ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.5: ; GFX90A-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -261,7 +255,6 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc_ ; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY]] - ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.3(0x80000000) @@ -287,14 +280,12 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc_ ; GFX908-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.4: ; GFX908-NEXT: successors: %bb.5(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.5: ; GFX908-NEXT: S_ENDPGM 0 ; @@ -311,7 +302,6 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc_ ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY]] - ; GFX90A-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2: ; GFX90A-NEXT: successors: %bb.3(0x80000000) @@ -337,14 +327,12 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc_ ; GFX90A-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX90A-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX90A-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX90A-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.4: ; GFX90A-NEXT: successors: %bb.5(0x80000000) ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.5: ; GFX90A-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.f16.ll index 5b19b1c913a94..852a05601550d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.f16.ll @@ -161,7 +161,6 @@ define amdgpu_ps half @raw_ptr_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__ ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -187,14 +186,12 @@ define amdgpu_ps half @raw_ptr_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__ ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; PACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -212,7 +209,6 @@ define amdgpu_ps half @raw_ptr_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__ ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -238,14 +234,12 @@ define amdgpu_ps half @raw_ptr_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__ ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN]] ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.ll index 2dc688db86e4f..d7cc96c0df30a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.ll @@ -108,7 +108,6 @@ define amdgpu_ps float @raw_ptr_buffer_load_format_f32__vgpr_rsrc__sgpr_voffset_ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -134,14 +133,12 @@ define amdgpu_ps float @raw_ptr_buffer_load_format_f32__vgpr_rsrc__sgpr_voffset_ ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.ll index 7b8b028128dd3..5906745d5c3f9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.ll @@ -57,7 +57,6 @@ define amdgpu_ps float @raw_ptr_buffer_load_f32__vgpr_rsrc__vgpr_voffset__sgpr_s ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -80,14 +79,12 @@ define amdgpu_ps float @raw_ptr_buffer_load_f32__vgpr_rsrc__vgpr_voffset__sgpr_s ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -109,7 +106,6 @@ define amdgpu_ps float @raw_ptr_buffer_load_f32__vgpr_rsrc__vgpr_voffset__vgpr_s ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -135,14 +131,12 @@ define amdgpu_ps float @raw_ptr_buffer_load_f32__vgpr_rsrc__vgpr_voffset__vgpr_s ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -526,7 +520,6 @@ define amdgpu_ps half @raw_ptr_buffer_load_f16__vgpr_rsrc__vgpr_voffset__sgpr_so ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -549,14 +542,12 @@ define amdgpu_ps half @raw_ptr_buffer_load_f16__vgpr_rsrc__vgpr_voffset__sgpr_so ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -578,7 +569,6 @@ define amdgpu_ps float @raw_ptr_buffer_load_i8__vgpr_rsrc__vgpr_voffset__sgpr_so ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -601,14 +591,12 @@ define amdgpu_ps float @raw_ptr_buffer_load_i8__vgpr_rsrc__vgpr_voffset__sgpr_so ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8) from %ir.rsrc, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -857,7 +845,6 @@ define amdgpu_ps float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_s ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000 ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def dead $scc ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -880,14 +867,12 @@ define amdgpu_ps float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_s ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_ADD_I32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -913,7 +898,6 @@ define amdgpu_ps float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_s ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -936,14 +920,12 @@ define amdgpu_ps float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_s ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE1]], [[COPY5]], 904, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.format.f16.ll index 3ed6bbdd36156..d10f901da3bcb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.format.f16.ll @@ -171,7 +171,6 @@ define amdgpu_ps void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_vo ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY9]], [[COPY5]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3 ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -194,14 +193,12 @@ define amdgpu_ps void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_vo ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE2]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>) into %ir.rsrc, align 1, addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: S_ENDPGM 0 ; @@ -220,7 +217,6 @@ define amdgpu_ps void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_vo ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -243,14 +239,12 @@ define amdgpu_ps void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_vo ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE2]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>) into %ir.rsrc, align 1, addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.ptr.buffer.store.format.v4f16(<4 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -482,7 +476,6 @@ define amdgpu_ps void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_vo ; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY10]], [[COPY5]], implicit $exec ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3 ; UNPACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -505,14 +498,12 @@ define amdgpu_ps void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_vo ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE2]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>) into %ir.rsrc, align 1, addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: S_ENDPGM 0 ; @@ -534,7 +525,6 @@ define amdgpu_ps void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_vo ; PACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; PACKED-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec ; PACKED-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -557,14 +547,12 @@ define amdgpu_ps void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_vo ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE2]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>) into %ir.rsrc, align 1, addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.format.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.format.f32.ll index dee83a9b0a6ec..cf996f6d98969 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.format.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.format.f32.ll @@ -123,7 +123,6 @@ define amdgpu_ps void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_vo ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr8 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -146,14 +145,12 @@ define amdgpu_ps void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_vo ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE]], [[COPY8]], [[REG_SEQUENCE2]], [[COPY9]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>) into %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.ptr.buffer.store.format.v4f32(<4 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -294,7 +291,6 @@ define amdgpu_ps void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_vo ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY8]], [[COPY10]], 0, implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -317,14 +313,12 @@ define amdgpu_ps void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_vo ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE2]], [[COPY9]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>) into %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.ll index 2c99ce8694bcc..cd3ad3e4ab5ba 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.ll @@ -59,7 +59,6 @@ define amdgpu_ps void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -82,14 +81,12 @@ define amdgpu_ps void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__ ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -111,7 +108,6 @@ define amdgpu_ps void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -125,14 +121,12 @@ define amdgpu_ps void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__ ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -154,7 +148,6 @@ define amdgpu_ps void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -180,14 +173,12 @@ define amdgpu_ps void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__ ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -527,7 +518,6 @@ define amdgpu_ps void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -550,14 +540,12 @@ define amdgpu_ps void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__ ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE2]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>) into %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.ptr.buffer.store.v4f16(<4 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -784,7 +772,6 @@ define amdgpu_ps void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -807,14 +794,12 @@ define amdgpu_ps void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__ ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE1]], [[COPY6]], 904, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 %voffset.add = add i32 %voffset, 5000 @@ -838,7 +823,6 @@ define amdgpu_ps void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -861,14 +845,12 @@ define amdgpu_ps void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__ ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY5]], 904, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 5000, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.f16.ll index a799e203d6439..59e740851a145 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.f16.ll @@ -159,7 +159,6 @@ define amdgpu_ps half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -185,14 +184,12 @@ define amdgpu_ps half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN]] ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -210,7 +207,6 @@ define amdgpu_ps half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -236,14 +232,12 @@ define amdgpu_ps half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; PACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.ll index 3e135472ebbb1..37c1cfa42873f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.ll @@ -107,7 +107,6 @@ define amdgpu_ps float @raw_tbuffer_load_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -133,14 +132,12 @@ define amdgpu_ps float @raw_tbuffer_load_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.f16.ll index 725faa1b4a49f..0a44bb50b8231 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.f16.ll @@ -138,7 +138,6 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soff ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -161,14 +160,12 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soff ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: S_ENDPGM 0 ; @@ -185,7 +182,6 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soff ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -208,14 +204,12 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soff ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.ptr.tbuffer.store.f16(half %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -237,7 +231,6 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soff ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -263,14 +256,12 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soff ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: S_ENDPGM 0 ; @@ -287,7 +278,6 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soff ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -313,14 +303,12 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soff ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.ptr.tbuffer.store.f16(half %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) @@ -343,7 +331,6 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soff ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -369,14 +356,12 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soff ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: S_ENDPGM 0 ; @@ -394,7 +379,6 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soff ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -420,14 +404,12 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soff ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.ptr.tbuffer.store.f16(half %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.i8.ll index a12a6005df24e..02ab99c6e0180 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.i8.ll @@ -51,7 +51,6 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffs ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -74,14 +73,12 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffs ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s8) into %ir.rsrc, addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: S_ENDPGM 0 ; @@ -98,7 +95,6 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffs ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -121,14 +117,12 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffs ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s8) into %ir.rsrc, addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.ptr.tbuffer.store.i8(i8 %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -150,7 +144,6 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -176,14 +169,12 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s8) into %ir.rsrc, addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: S_ENDPGM 0 ; @@ -200,7 +191,6 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -226,14 +216,12 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s8) into %ir.rsrc, addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.ptr.tbuffer.store.i8(i8 %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -256,7 +244,6 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -282,14 +269,12 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s8) into %ir.rsrc, addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: S_ENDPGM 0 ; @@ -307,7 +292,6 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -333,14 +317,12 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s8) into %ir.rsrc, addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.ptr.tbuffer.store.i8(i8 %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.ll index 9db5c160a6236..e195985018405 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.ll @@ -127,7 +127,6 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__sgpr_soff ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -150,14 +149,12 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__sgpr_soff ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 94, 1, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 1) @@ -179,7 +176,6 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__vgpr_soff ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -205,14 +201,12 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__vgpr_soff ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -235,7 +229,6 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -261,14 +254,12 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) @@ -597,7 +588,6 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000 ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY6]], [[S_MOV_B32_]], implicit-def dead $scc ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -620,14 +610,12 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[S_ADD_I32_]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_1]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 %soffset = add i32 %soffset.base, 5000 @@ -653,7 +641,6 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -676,14 +663,12 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[V_ADD_U32_e64_]], [[REG_SEQUENCE1]], [[COPY6]], 904, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_1]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 %voffset = add i32 %voffset.base, 5000 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll index 1cfb15391be36..98e1614e0707d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll @@ -208,7 +208,6 @@ define amdgpu_ps half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -234,14 +233,12 @@ define amdgpu_ps half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN]] ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -259,7 +256,6 @@ define amdgpu_ps half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; PACKED-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -285,14 +281,12 @@ define amdgpu_ps half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; PACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -310,7 +304,6 @@ define amdgpu_ps half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -336,14 +329,12 @@ define amdgpu_ps half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_VBUFFER_OFFEN [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_VBUFFER_OFFEN]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll index 4f8b20d10c874..64f6756c534ea 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll @@ -183,7 +183,6 @@ define amdgpu_ps float @raw_tbuffer_load_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff ; GFX10_GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX10_GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX10_GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; GFX10_GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX10_GFX11-NEXT: {{ $}} ; GFX10_GFX11-NEXT: bb.2: ; GFX10_GFX11-NEXT: successors: %bb.3(0x80000000) @@ -209,14 +208,12 @@ define amdgpu_ps float @raw_tbuffer_load_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff ; GFX10_GFX11-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX10_GFX11-NEXT: {{ $}} ; GFX10_GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX10_GFX11-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX10_GFX11-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX10_GFX11-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX10_GFX11-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX10_GFX11-NEXT: {{ $}} ; GFX10_GFX11-NEXT: bb.4: ; GFX10_GFX11-NEXT: successors: %bb.5(0x80000000) ; GFX10_GFX11-NEXT: {{ $}} - ; GFX10_GFX11-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX10_GFX11-NEXT: {{ $}} ; GFX10_GFX11-NEXT: bb.5: ; GFX10_GFX11-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]] ; GFX10_GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -234,7 +231,6 @@ define amdgpu_ps float @raw_tbuffer_load_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -260,14 +256,12 @@ define amdgpu_ps float @raw_tbuffer_load_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN [[COPY6]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll index b9d0cb52d2405..7c1e02a63c98e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll @@ -186,7 +186,6 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soff ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -209,14 +208,12 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soff ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: S_ENDPGM 0 ; @@ -233,7 +230,6 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soff ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -256,14 +252,12 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soff ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 ; @@ -280,7 +274,6 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soff ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -303,14 +296,12 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soff ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: TBUFFER_STORE_FORMAT_D16_X_VBUFFER_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -332,7 +323,6 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soff ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -358,14 +348,12 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soff ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: S_ENDPGM 0 ; @@ -382,7 +370,6 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soff ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -408,14 +395,12 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soff ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 ; @@ -432,7 +417,6 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soff ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -458,14 +442,12 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soff ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: TBUFFER_STORE_FORMAT_D16_X_VBUFFER_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) @@ -488,7 +470,6 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soff ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -514,14 +495,12 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soff ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: S_ENDPGM 0 ; @@ -539,7 +518,6 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soff ; PACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -565,14 +543,12 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soff ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 ; @@ -590,7 +566,6 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soff ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -616,14 +591,12 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soff ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: TBUFFER_STORE_FORMAT_D16_X_VBUFFER_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll index c1fb4aacafe1d..86d00763159d1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll @@ -67,7 +67,6 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffs ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -90,14 +89,12 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffs ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s8), addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: S_ENDPGM 0 ; @@ -114,7 +111,6 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffs ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -137,14 +133,12 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffs ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s8), addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 ; @@ -161,7 +155,6 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffs ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -184,14 +177,12 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffs ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s8), addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.i8(i8 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -213,7 +204,6 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -239,14 +229,12 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s8), addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: S_ENDPGM 0 ; @@ -263,7 +251,6 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; PACKED-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -289,14 +276,12 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s8), addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 ; @@ -313,7 +298,6 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -339,14 +323,12 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__vgpr_soffs ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s8), addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.i8(i8 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -369,7 +351,6 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -395,14 +376,12 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; UNPACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s8), addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: S_ENDPGM 0 ; @@ -420,7 +399,6 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; PACKED-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -446,14 +424,12 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; PACKED-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s8), addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 ; @@ -471,7 +447,6 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -497,14 +472,12 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s8), addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.i8(i8 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll index 09227af922a6e..65992c2cebf62 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll @@ -213,7 +213,6 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__sgpr_soff ; GFX10_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX10_GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; GFX10_GFX11-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX10_GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX10_GFX11-NEXT: {{ $}} ; GFX10_GFX11-NEXT: bb.2: ; GFX10_GFX11-NEXT: successors: %bb.3(0x80000000) @@ -236,14 +235,12 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__sgpr_soff ; GFX10_GFX11-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX10_GFX11-NEXT: {{ $}} ; GFX10_GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 94, 1, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX10_GFX11-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX10_GFX11-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX10_GFX11-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX10_GFX11-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX10_GFX11-NEXT: {{ $}} ; GFX10_GFX11-NEXT: bb.4: ; GFX10_GFX11-NEXT: successors: %bb.5(0x80000000) ; GFX10_GFX11-NEXT: {{ $}} - ; GFX10_GFX11-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX10_GFX11-NEXT: {{ $}} ; GFX10_GFX11-NEXT: bb.5: ; GFX10_GFX11-NEXT: S_ENDPGM 0 ; @@ -260,7 +257,6 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__sgpr_soff ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -283,14 +279,12 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__sgpr_soff ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 94, 1, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 1) @@ -312,7 +306,6 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__vgpr_soff ; GFX10_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX10_GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; GFX10_GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 - ; GFX10_GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX10_GFX11-NEXT: {{ $}} ; GFX10_GFX11-NEXT: bb.2: ; GFX10_GFX11-NEXT: successors: %bb.3(0x80000000) @@ -338,14 +331,12 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__vgpr_soff ; GFX10_GFX11-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX10_GFX11-NEXT: {{ $}} ; GFX10_GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX10_GFX11-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX10_GFX11-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX10_GFX11-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX10_GFX11-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX10_GFX11-NEXT: {{ $}} ; GFX10_GFX11-NEXT: bb.4: ; GFX10_GFX11-NEXT: successors: %bb.5(0x80000000) ; GFX10_GFX11-NEXT: {{ $}} - ; GFX10_GFX11-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX10_GFX11-NEXT: {{ $}} ; GFX10_GFX11-NEXT: bb.5: ; GFX10_GFX11-NEXT: S_ENDPGM 0 ; @@ -362,7 +353,6 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__vgpr_soff ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -388,14 +378,12 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__vgpr_soff ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -418,7 +406,6 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff ; GFX10_GFX11-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX10_GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; GFX10_GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX10_GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX10_GFX11-NEXT: {{ $}} ; GFX10_GFX11-NEXT: bb.2: ; GFX10_GFX11-NEXT: successors: %bb.3(0x80000000) @@ -444,14 +431,12 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff ; GFX10_GFX11-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX10_GFX11-NEXT: {{ $}} ; GFX10_GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX10_GFX11-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX10_GFX11-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX10_GFX11-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX10_GFX11-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX10_GFX11-NEXT: {{ $}} ; GFX10_GFX11-NEXT: bb.4: ; GFX10_GFX11-NEXT: successors: %bb.5(0x80000000) ; GFX10_GFX11-NEXT: {{ $}} - ; GFX10_GFX11-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX10_GFX11-NEXT: {{ $}} ; GFX10_GFX11-NEXT: bb.5: ; GFX10_GFX11-NEXT: S_ENDPGM 0 ; @@ -469,7 +454,6 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -495,14 +479,12 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) @@ -1059,7 +1041,6 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff ; GFX10_GFX11-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX10_GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000 ; GFX10_GFX11-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY6]], [[S_MOV_B32_]], implicit-def dead $scc - ; GFX10_GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX10_GFX11-NEXT: {{ $}} ; GFX10_GFX11-NEXT: bb.2: ; GFX10_GFX11-NEXT: successors: %bb.3(0x80000000) @@ -1082,14 +1063,12 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff ; GFX10_GFX11-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX10_GFX11-NEXT: {{ $}} ; GFX10_GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[S_ADD_I32_]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX10_GFX11-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX10_GFX11-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX10_GFX11-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX10_GFX11-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX10_GFX11-NEXT: {{ $}} ; GFX10_GFX11-NEXT: bb.4: ; GFX10_GFX11-NEXT: successors: %bb.5(0x80000000) ; GFX10_GFX11-NEXT: {{ $}} - ; GFX10_GFX11-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_1]] - ; GFX10_GFX11-NEXT: {{ $}} ; GFX10_GFX11-NEXT: bb.5: ; GFX10_GFX11-NEXT: S_ENDPGM 0 ; @@ -1108,7 +1087,6 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000 ; GFX12-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY6]], [[S_MOV_B32_]], implicit-def dead $scc - ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -1131,14 +1109,12 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[S_ADD_I32_]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_1]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 %soffset = add i32 %soffset.base, 5000 @@ -1164,7 +1140,6 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff ; GFX10_GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; GFX10_GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GFX10_GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; GFX10_GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX10_GFX11-NEXT: {{ $}} ; GFX10_GFX11-NEXT: bb.2: ; GFX10_GFX11-NEXT: successors: %bb.3(0x80000000) @@ -1187,14 +1162,12 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff ; GFX10_GFX11-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX10_GFX11-NEXT: {{ $}} ; GFX10_GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[V_ADD_U32_e64_]], [[REG_SEQUENCE1]], [[COPY6]], 904, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX10_GFX11-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX10_GFX11-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX10_GFX11-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX10_GFX11-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX10_GFX11-NEXT: {{ $}} ; GFX10_GFX11-NEXT: bb.4: ; GFX10_GFX11-NEXT: successors: %bb.5(0x80000000) ; GFX10_GFX11-NEXT: {{ $}} - ; GFX10_GFX11-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_1]] - ; GFX10_GFX11-NEXT: {{ $}} ; GFX10_GFX11-NEXT: bb.5: ; GFX10_GFX11-NEXT: S_ENDPGM 0 ; @@ -1211,7 +1184,6 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -1234,14 +1206,12 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 5000, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 %voffset = add i32 %voffset.base, 5000 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll index 1a2a50f444ddb..a235a21002e41 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll @@ -3620,7 +3620,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg % ; GFX6-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX6-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.2: ; GFX6-NEXT: successors: %bb.3(0x80000000) @@ -3643,14 +3642,12 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg % ; GFX6-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) - ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX6-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX6-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.4: ; GFX6-NEXT: successors: %bb.5(0x80000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.5: ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -3668,7 +3665,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg % ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.2: ; GFX7-NEXT: successors: %bb.3(0x80000000) @@ -3691,14 +3687,12 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg % ; GFX7-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.4: ; GFX7-NEXT: successors: %bb.5(0x80000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.5: ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -3716,7 +3710,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg % ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -3739,14 +3732,12 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg % ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -3763,7 +3754,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg % ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -3786,14 +3776,12 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg % ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[COPY5]], [[REG_SEQUENCE1]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -3814,7 +3802,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> % ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.2: ; GFX6-NEXT: successors: %bb.3(0x80000000) @@ -3837,14 +3824,12 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> % ; GFX6-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4092, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) - ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX6-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX6-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.4: ; GFX6-NEXT: successors: %bb.5(0x80000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.5: ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -3860,7 +3845,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> % ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.2: ; GFX7-NEXT: successors: %bb.3(0x80000000) @@ -3883,14 +3867,12 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> % ; GFX7-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4092, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.4: ; GFX7-NEXT: successors: %bb.5(0x80000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.5: ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -3906,7 +3888,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> % ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -3929,14 +3910,12 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> % ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4092, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -3952,7 +3931,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> % ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -3975,14 +3953,12 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> % ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4092, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -4008,7 +3984,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> % ; GFX6-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY4]], [[S_MOV_B32_]], implicit-def dead $scc ; GFX6-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.2: ; GFX6-NEXT: successors: %bb.3(0x80000000) @@ -4031,14 +4006,12 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> % ; GFX6-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) - ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX6-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX6-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.4: ; GFX6-NEXT: successors: %bb.5(0x80000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.5: ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -4058,7 +4031,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> % ; GFX7-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY4]], [[S_MOV_B32_]], implicit-def dead $scc ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.2: ; GFX7-NEXT: successors: %bb.3(0x80000000) @@ -4081,14 +4053,12 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> % ; GFX7-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.4: ; GFX7-NEXT: successors: %bb.5(0x80000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.5: ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -4108,7 +4078,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> % ; GFX8-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY4]], [[S_MOV_B32_]], implicit-def dead $scc ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] ; GFX8-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -4131,14 +4100,12 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> % ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -4154,7 +4121,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> % ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -4177,14 +4143,12 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> % ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4096, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -4206,7 +4170,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc) ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.2: ; GFX6-NEXT: successors: %bb.3(0x80000000) @@ -4229,14 +4192,12 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc) ; GFX6-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4095, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from unknown-address + 4095, align 1) - ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX6-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX6-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.4: ; GFX6-NEXT: successors: %bb.5(0x80000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.5: ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -4252,7 +4213,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc) ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.2: ; GFX7-NEXT: successors: %bb.3(0x80000000) @@ -4275,14 +4235,12 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc) ; GFX7-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4095, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from unknown-address + 4095, align 1) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.4: ; GFX7-NEXT: successors: %bb.5(0x80000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.5: ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -4298,7 +4256,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc) ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -4321,14 +4278,12 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc) ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4095, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from unknown-address + 4095, align 1) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -4343,7 +4298,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc) ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -4366,14 +4320,12 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc) ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET [[REG_SEQUENCE1]], $sgpr_null, 4095, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from unknown-address + 4095, align 1) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -4396,7 +4348,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc) ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.2: ; GFX6-NEXT: successors: %bb.3(0x80000000) @@ -4419,14 +4370,12 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc) ; GFX6-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) - ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX6-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX6-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.4: ; GFX6-NEXT: successors: %bb.5(0x80000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.5: ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -4444,7 +4393,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc) ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.2: ; GFX7-NEXT: successors: %bb.3(0x80000000) @@ -4467,14 +4415,12 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc) ; GFX7-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.4: ; GFX7-NEXT: successors: %bb.5(0x80000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.5: ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -4490,7 +4436,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc) ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -4513,14 +4458,12 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc) ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4095, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from unknown-address + 4096) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -4535,7 +4478,6 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc) ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -4558,14 +4500,12 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc) ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET [[REG_SEQUENCE1]], $sgpr_null, 4096, 0, 0, implicit $exec :: (dereferenceable invariant load (s32) from unknown-address + 4096) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -4587,7 +4527,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> % ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.2: ; GFX6-NEXT: successors: %bb.3(0x80000000) @@ -4611,14 +4550,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> % ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX6-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX6-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.4: ; GFX6-NEXT: successors: %bb.5(0x80000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.5: ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFSET]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFSET1]], %subreg.sub4_sub5_sub6_sub7 ; GFX6-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -4650,7 +4587,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> % ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.2: ; GFX7-NEXT: successors: %bb.3(0x80000000) @@ -4674,14 +4610,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> % ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.4: ; GFX7-NEXT: successors: %bb.5(0x80000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.5: ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFSET]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFSET1]], %subreg.sub4_sub5_sub6_sub7 ; GFX7-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -4713,7 +4647,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> % ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -4737,14 +4670,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> % ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFSET]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFSET1]], %subreg.sub4_sub5_sub6_sub7 ; GFX8-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -4776,7 +4707,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> % ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -4800,14 +4730,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> % ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET1]], %subreg.sub4_sub5_sub6_sub7 ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -4850,7 +4778,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> % ; GFX6-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY4]], [[S_MOV_B32_]], implicit-def dead $scc ; GFX6-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.2: ; GFX6-NEXT: successors: %bb.3(0x80000000) @@ -4874,14 +4801,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> % ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX6-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX6-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.4: ; GFX6-NEXT: successors: %bb.5(0x80000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.5: ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX6-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -4917,7 +4842,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> % ; GFX7-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY4]], [[S_MOV_B32_]], implicit-def dead $scc ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.2: ; GFX7-NEXT: successors: %bb.3(0x80000000) @@ -4941,14 +4865,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> % ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.4: ; GFX7-NEXT: successors: %bb.5(0x80000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.5: ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX7-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -4984,7 +4906,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> % ; GFX8-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY4]], [[S_MOV_B32_]], implicit-def dead $scc ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] ; GFX8-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -5008,14 +4929,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> % ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX8-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -5047,7 +4966,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> % ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -5071,14 +4989,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> % ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4068, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4084, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET1]], %subreg.sub4_sub5_sub6_sub7 ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -5119,7 +5035,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> % ; GFX6-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY4]], [[S_MOV_B32_]], implicit-def dead $scc ; GFX6-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.2: ; GFX6-NEXT: successors: %bb.3(0x80000000) @@ -5143,14 +5058,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> % ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX6-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX6-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.4: ; GFX6-NEXT: successors: %bb.5(0x80000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.5: ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX6-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -5186,7 +5099,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> % ; GFX7-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY4]], [[S_MOV_B32_]], implicit-def dead $scc ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.2: ; GFX7-NEXT: successors: %bb.3(0x80000000) @@ -5210,14 +5122,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> % ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.4: ; GFX7-NEXT: successors: %bb.5(0x80000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.5: ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX7-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -5253,7 +5163,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> % ; GFX8-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY4]], [[S_MOV_B32_]], implicit-def dead $scc ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] ; GFX8-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -5277,14 +5186,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> % ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX8-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -5316,7 +5223,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> % ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -5340,14 +5246,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> % ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4096, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET [[REG_SEQUENCE1]], [[COPY4]], 4112, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET1]], %subreg.sub4_sub5_sub6_sub7 ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -5385,7 +5289,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000 ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.2: ; GFX6-NEXT: successors: %bb.3(0x80000000) @@ -5409,14 +5312,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000 ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX6-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX6-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.4: ; GFX6-NEXT: successors: %bb.5(0x80000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.5: ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX6-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -5449,7 +5350,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000 ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.2: ; GFX7-NEXT: successors: %bb.3(0x80000000) @@ -5473,14 +5373,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.4: ; GFX7-NEXT: successors: %bb.5(0x80000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.5: ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX7-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -5513,7 +5411,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000 ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4064 - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -5537,14 +5434,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 936, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 952, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX8-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -5576,7 +5471,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -5600,14 +5494,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], $sgpr_null, 5000, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], $sgpr_null, 5016, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -5645,7 +5537,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076 ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4076 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.2: ; GFX6-NEXT: successors: %bb.3(0x80000000) @@ -5669,14 +5560,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076 ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX6-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX6-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.4: ; GFX6-NEXT: successors: %bb.5(0x80000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.5: ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX6-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -5709,7 +5598,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076 ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4076 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.2: ; GFX7-NEXT: successors: %bb.3(0x80000000) @@ -5733,14 +5621,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.4: ; GFX7-NEXT: successors: %bb.5(0x80000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.5: ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX7-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -5773,7 +5659,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076 ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 12 - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -5797,14 +5682,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX8-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -5836,7 +5719,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -5860,14 +5742,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], $sgpr_null, 4076, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], $sgpr_null, 4092, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -5905,7 +5785,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080 ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4080 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.2: ; GFX6-NEXT: successors: %bb.3(0x80000000) @@ -5929,14 +5808,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080 ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX6-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX6-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.4: ; GFX6-NEXT: successors: %bb.5(0x80000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.5: ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX6-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -5969,7 +5846,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080 ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4080 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.2: ; GFX7-NEXT: successors: %bb.3(0x80000000) @@ -5993,14 +5869,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.4: ; GFX7-NEXT: successors: %bb.5(0x80000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.5: ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX7-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -6033,7 +5907,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080 ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -6057,14 +5930,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX8-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -6096,7 +5967,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -6120,14 +5990,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], $sgpr_null, 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], $sgpr_null, 4096, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -6164,7 +6032,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4 ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.2: ; GFX6-NEXT: successors: %bb.3(0x80000000) @@ -6188,14 +6055,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4 ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4) ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4) - ; GFX6-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX6-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX6-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX6-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.4: ; GFX6-NEXT: successors: %bb.5(0x80000000) ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: bb.5: ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFSET]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFSET1]], %subreg.sub4_sub5_sub6_sub7 ; GFX6-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -6227,7 +6092,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.2: ; GFX7-NEXT: successors: %bb.3(0x80000000) @@ -6251,14 +6115,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4) ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.4: ; GFX7-NEXT: successors: %bb.5(0x80000000) ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.5: ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFSET]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFSET1]], %subreg.sub4_sub5_sub6_sub7 ; GFX7-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -6290,7 +6152,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4 ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -6314,14 +6175,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4) ; GFX8-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFSET]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFSET1]], %subreg.sub4_sub5_sub6_sub7 ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 @@ -6352,7 +6211,6 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -6376,14 +6234,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET [[REG_SEQUENCE1]], $sgpr_null, 4064, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4) ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET [[REG_SEQUENCE1]], $sgpr_null, 4080, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET1]], %subreg.sub4_sub5_sub6_sub7 ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll index ab720ce8f942c..b059a5e1466ea 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll @@ -197,7 +197,6 @@ define amdgpu_ps float @struct_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_ ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX8-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; GFX8-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -224,14 +223,12 @@ define amdgpu_ps float @struct_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_ ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; GFX8-NEXT: [[BUFFER_ATOMIC_ADD_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_BOTHEN_RTN [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_BOTHEN_RTN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -253,7 +250,6 @@ define amdgpu_ps float @struct_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -280,14 +276,12 @@ define amdgpu_ps float @struct_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_ ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; GFX12-NEXT: [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -315,7 +309,6 @@ define amdgpu_ps void @struct_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__ ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX8-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; GFX8-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -342,14 +335,12 @@ define amdgpu_ps void @struct_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__ ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; GFX8-NEXT: BUFFER_ATOMIC_ADD_BOTHEN [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: S_ENDPGM 0 ; @@ -370,7 +361,6 @@ define amdgpu_ps void @struct_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -397,14 +387,12 @@ define amdgpu_ps void @struct_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__ ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; GFX12-NEXT: BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 %ret = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll index f9f70ecadfe60..abf0cffa1ec09 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll @@ -114,7 +114,6 @@ define amdgpu_ps float @struct_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vg ; GFX8-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] ; GFX8-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] ; GFX8-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -143,14 +142,12 @@ define amdgpu_ps float @struct_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vg ; GFX8-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; GFX8-NEXT: [[BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX8-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN]].sub0 - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: $vgpr0 = COPY [[COPY17]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -174,7 +171,6 @@ define amdgpu_ps float @struct_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vg ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -203,14 +199,12 @@ define amdgpu_ps float @struct_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vg ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; GFX12-NEXT: [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX12-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN]].sub0 - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[COPY17]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -240,7 +234,6 @@ define amdgpu_ps void @struct_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cm ; GFX8-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] ; GFX8-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] ; GFX8-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -268,14 +261,12 @@ define amdgpu_ps void @struct_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cm ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY12]], %subreg.sub1 ; GFX8-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; GFX8-NEXT: BUFFER_ATOMIC_CMPSWAP_BOTHEN [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: S_ENDPGM 0 ; @@ -298,7 +289,6 @@ define amdgpu_ps void @struct_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cm ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -326,14 +316,12 @@ define amdgpu_ps void @struct_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cm ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY12]], %subreg.sub1 ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; GFX12-NEXT: BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 %ret = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -530,7 +518,6 @@ define amdgpu_ps double @struct_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__v ; GFX8-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] ; GFX8-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] ; GFX8-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -559,14 +546,12 @@ define amdgpu_ps double @struct_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__v ; GFX8-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0_sub1, [[COPY12]], %subreg.sub2_sub3 ; GFX8-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN [[REG_SEQUENCE5]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) ; GFX8-NEXT: [[COPY19:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN]].sub0_sub1 - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[COPY19]].sub0 ; GFX8-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[COPY19]].sub1 @@ -599,7 +584,6 @@ define amdgpu_ps double @struct_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__v ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -628,14 +612,12 @@ define amdgpu_ps double @struct_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__v ; GFX12-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0_sub1, [[COPY12]], %subreg.sub2_sub3 ; GFX12-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE5]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) ; GFX12-NEXT: [[COPY19:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN]].sub0_sub1 - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[COPY19]].sub0 ; GFX12-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[COPY19]].sub1 @@ -674,7 +656,6 @@ define amdgpu_ps void @struct_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cm ; GFX8-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] ; GFX8-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] ; GFX8-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -702,14 +683,12 @@ define amdgpu_ps void @struct_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cm ; GFX8-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY13]], %subreg.sub0, [[COPY14]], %subreg.sub1 ; GFX8-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0_sub1, [[COPY12]], %subreg.sub2_sub3 ; GFX8-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN [[REG_SEQUENCE5]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: S_ENDPGM 0 ; @@ -736,7 +715,6 @@ define amdgpu_ps void @struct_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cm ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -764,14 +742,12 @@ define amdgpu_ps void @struct_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cm ; GFX12-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY13]], %subreg.sub0, [[COPY14]], %subreg.sub1 ; GFX12-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0_sub1, [[COPY12]], %subreg.sub2_sub3 ; GFX12-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN [[REG_SEQUENCE5]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 %ret = call i64 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll index 8589fe9fd056d..7c216f071ae9f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll @@ -168,7 +168,6 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__ ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.3(0x80000000) @@ -195,14 +194,12 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__ ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.4: ; GFX908-NEXT: successors: %bb.5(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.5: ; GFX908-NEXT: S_ENDPGM 0 ; @@ -223,7 +220,6 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__ ; GFX90A-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX90A-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; GFX90A-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX90A-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2: ; GFX90A-NEXT: successors: %bb.3(0x80000000) @@ -250,14 +246,12 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__ ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX90A-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX90A-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX90A-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX90A-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.4: ; GFX90A-NEXT: successors: %bb.5(0x80000000) ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.5: ; GFX90A-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -281,7 +275,6 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__ ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.3(0x80000000) @@ -307,14 +300,12 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__ ; GFX908-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.4: ; GFX908-NEXT: successors: %bb.5(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.5: ; GFX908-NEXT: S_ENDPGM 0 ; @@ -333,7 +324,6 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__ ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX90A-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX90A-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2: ; GFX90A-NEXT: successors: %bb.3(0x80000000) @@ -359,14 +349,12 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__ ; GFX90A-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GFX90A-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX90A-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX90A-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX90A-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.4: ; GFX90A-NEXT: successors: %bb.5(0x80000000) ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.5: ; GFX90A-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll index 870588014cd29..10203ad185ada 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll @@ -229,7 +229,6 @@ define amdgpu_ps <4 x half> @struct_buffer_load_format_v4f16__vpr_rsrc__sgpr_vin ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -256,14 +255,12 @@ define amdgpu_ps <4 x half> @struct_buffer_load_format_v4f16__vpr_rsrc__sgpr_vin ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; UNPACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>), align 1, addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub0 ; UNPACKED-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub1 @@ -304,7 +301,6 @@ define amdgpu_ps <4 x half> @struct_buffer_load_format_v4f16__vpr_rsrc__sgpr_vin ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; PACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -331,14 +327,12 @@ define amdgpu_ps <4 x half> @struct_buffer_load_format_v4f16__vpr_rsrc__sgpr_vin ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; PACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>), align 1, addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub0 ; PACKED-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub1 @@ -361,7 +355,6 @@ define amdgpu_ps <4 x half> @struct_buffer_load_format_v4f16__vpr_rsrc__sgpr_vin ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -388,14 +381,12 @@ define amdgpu_ps <4 x half> @struct_buffer_load_format_v4f16__vpr_rsrc__sgpr_vin ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; GFX12-NEXT: [[BUFFER_LOAD_FORMAT_D16_XYZW_VBUFFER_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XYZW_VBUFFER_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_VBUFFER_BOTHEN]].sub0 ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_VBUFFER_BOTHEN]].sub1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll index 06bd45a45cced..9c950039ab378 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll @@ -202,7 +202,6 @@ define amdgpu_ps <4 x float> @struct_buffer_load_format_v4f32__vpr_rsrc__sgpr_vi ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -229,14 +228,12 @@ define amdgpu_ps <4 x float> @struct_buffer_load_format_v4f32__vpr_rsrc__sgpr_vi ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; GFX8-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub0 ; GFX8-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub1 @@ -263,7 +260,6 @@ define amdgpu_ps <4 x float> @struct_buffer_load_format_v4f32__vpr_rsrc__sgpr_vi ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -290,14 +286,12 @@ define amdgpu_ps <4 x float> @struct_buffer_load_format_v4f32__vpr_rsrc__sgpr_vi ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; GFX12-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_VBUFFER_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_BOTHEN]].sub0 ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_VBUFFER_BOTHEN]].sub1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll index 94ce8aac8a4c6..5cc6f8cc726e0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll @@ -324,7 +324,6 @@ define amdgpu_ps float @struct_buffer_load_f32__vgpr_rsrc__sgpr_vindex__sgpr_vof ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -351,14 +350,12 @@ define amdgpu_ps float @struct_buffer_load_f32__vgpr_rsrc__sgpr_vindex__sgpr_vof ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_BOTHEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -378,7 +375,6 @@ define amdgpu_ps float @struct_buffer_load_f32__vgpr_rsrc__sgpr_vindex__sgpr_vof ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -405,14 +401,12 @@ define amdgpu_ps float @struct_buffer_load_f32__vgpr_rsrc__sgpr_vindex__sgpr_vof ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll index f62a15d470afd..7d9766d667f1a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll @@ -207,7 +207,6 @@ define amdgpu_ps void @struct_buffer_store_format_f16__sgpr_val__vgpr_rsrc__sgpr ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -234,14 +233,12 @@ define amdgpu_ps void @struct_buffer_store_format_f16__sgpr_val__vgpr_rsrc__sgpr ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_X_gfx80_BOTHEN_exact [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: S_ENDPGM 0 ; @@ -262,7 +259,6 @@ define amdgpu_ps void @struct_buffer_store_format_f16__sgpr_val__vgpr_rsrc__sgpr ; PACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; PACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; PACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -289,14 +285,12 @@ define amdgpu_ps void @struct_buffer_store_format_f16__sgpr_val__vgpr_rsrc__sgpr ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_X_BOTHEN_exact [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 ; @@ -317,7 +311,6 @@ define amdgpu_ps void @struct_buffer_store_format_f16__sgpr_val__vgpr_rsrc__sgpr ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -344,14 +337,12 @@ define amdgpu_ps void @struct_buffer_store_format_f16__sgpr_val__vgpr_rsrc__sgpr ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; GFX12-NEXT: BUFFER_STORE_FORMAT_D16_X_VBUFFER_BOTHEN_exact [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.struct.buffer.store.format.f16(half %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f32.ll index 8a395f0e73222..39e74fc05570a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f32.ll @@ -190,7 +190,6 @@ define amdgpu_ps void @struct_buffer_store_format_f32__sgpr_val__vgpr_rsrc__sgpr ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX8-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; GFX8-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -217,14 +216,12 @@ define amdgpu_ps void @struct_buffer_store_format_f32__sgpr_val__vgpr_rsrc__sgpr ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; GFX8-NEXT: BUFFER_STORE_FORMAT_X_BOTHEN_exact [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: S_ENDPGM 0 ; @@ -245,7 +242,6 @@ define amdgpu_ps void @struct_buffer_store_format_f32__sgpr_val__vgpr_rsrc__sgpr ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -272,14 +268,12 @@ define amdgpu_ps void @struct_buffer_store_format_f32__sgpr_val__vgpr_rsrc__sgpr ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; GFX12-NEXT: BUFFER_STORE_FORMAT_X_VBUFFER_BOTHEN_exact [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.struct.buffer.store.format.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll index b89ed46ba0550..b82cfc99ca163 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll @@ -195,7 +195,6 @@ define amdgpu_ps void @struct_buffer_store_v4f32_vgpr_rsrc__sgpr_val__sgpr_vinde ; GFX8-NEXT: [[COPY11:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]] ; GFX8-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] ; GFX8-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] - ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.2: ; GFX8-NEXT: successors: %bb.3(0x80000000) @@ -222,14 +221,12 @@ define amdgpu_ps void @struct_buffer_store_v4f32_vgpr_rsrc__sgpr_val__sgpr_vinde ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY13]], %subreg.sub1 ; GFX8-NEXT: BUFFER_STORE_DWORDX4_BOTHEN_exact [[COPY11]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>), align 1, addrspace 8) - ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.4: ; GFX8-NEXT: successors: %bb.5(0x80000000) ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: bb.5: ; GFX8-NEXT: S_ENDPGM 0 ; @@ -254,7 +251,6 @@ define amdgpu_ps void @struct_buffer_store_v4f32_vgpr_rsrc__sgpr_val__sgpr_vinde ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]] ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] - ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -281,14 +277,12 @@ define amdgpu_ps void @struct_buffer_store_v4f32_vgpr_rsrc__sgpr_val__sgpr_vinde ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY13]], %subreg.sub1 ; GFX12-NEXT: BUFFER_STORE_DWORDX4_VBUFFER_BOTHEN_exact [[COPY11]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>), align 1, addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.add.ll index 2e7323068d108..8eb8db64534ce 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.add.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.add.ll @@ -118,7 +118,6 @@ define amdgpu_ps float @struct_ptr_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__s ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -145,14 +144,12 @@ define amdgpu_ps float @struct_ptr_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__s ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; CHECK-NEXT: [[BUFFER_ATOMIC_ADD_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_BOTHEN_RTN [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_BOTHEN_RTN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -180,7 +177,6 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rs ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -207,14 +203,12 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rs ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; CHECK-NEXT: BUFFER_ATOMIC_ADD_BOTHEN [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 %ret = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.add.i32(i32 %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.cmpswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.cmpswap.ll index e8e6cab4edbe8..2364e5009baf8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.cmpswap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.cmpswap.ll @@ -73,7 +73,6 @@ define amdgpu_ps float @struct_ptr_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -102,14 +101,12 @@ define amdgpu_ps float @struct_ptr_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; CHECK-NEXT: [[BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN]].sub0 - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[COPY17]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -139,7 +136,6 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgp ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -167,14 +163,12 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgp ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY12]], %subreg.sub1 ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; CHECK-NEXT: BUFFER_ATOMIC_CMPSWAP_BOTHEN [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 %ret = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -297,7 +291,6 @@ define amdgpu_ps double @struct_ptr_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cm ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] ; CHECK-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -326,14 +319,12 @@ define amdgpu_ps double @struct_ptr_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cm ; CHECK-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0_sub1, [[COPY12]], %subreg.sub2_sub3 ; CHECK-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN [[REG_SEQUENCE5]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: [[COPY19:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN]].sub0_sub1 - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[COPY19]].sub0 ; CHECK-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[COPY19]].sub1 @@ -372,7 +363,6 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgp ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] ; CHECK-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -400,14 +390,12 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgp ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY13]], %subreg.sub0, [[COPY14]], %subreg.sub1 ; CHECK-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0_sub1, [[COPY12]], %subreg.sub2_sub3 ; CHECK-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN [[REG_SEQUENCE5]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 %ret = call i64 @llvm.amdgcn.struct.ptr.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.ll index 54657982493f7..542085ddca2c4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.ll @@ -168,7 +168,6 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rs ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.3(0x80000000) @@ -195,14 +194,12 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rs ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.4: ; GFX908-NEXT: successors: %bb.5(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.5: ; GFX908-NEXT: S_ENDPGM 0 ; @@ -223,7 +220,6 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rs ; GFX90A-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX90A-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; GFX90A-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX90A-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2: ; GFX90A-NEXT: successors: %bb.3(0x80000000) @@ -250,14 +246,12 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rs ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX90A-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX90A-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX90A-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.4: ; GFX90A-NEXT: successors: %bb.5(0x80000000) ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.5: ; GFX90A-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -281,7 +275,6 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rs ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: ; GFX908-NEXT: successors: %bb.3(0x80000000) @@ -307,14 +300,12 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rs ; GFX908-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.4: ; GFX908-NEXT: successors: %bb.5(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.5: ; GFX908-NEXT: S_ENDPGM 0 ; @@ -333,7 +324,6 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rs ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; GFX90A-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX90A-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2: ; GFX90A-NEXT: successors: %bb.3(0x80000000) @@ -359,14 +349,12 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rs ; GFX90A-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) - ; GFX90A-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX90A-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX90A-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX90A-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.4: ; GFX90A-NEXT: successors: %bb.5(0x80000000) ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.5: ; GFX90A-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.f16.ll index 6c0319ef570d6..323d77665e5f0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.f16.ll @@ -174,7 +174,6 @@ define amdgpu_ps <4 x half> @struct_ptr_buffer_load_format_v4f16__vpr_rsrc__sgpr ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -201,14 +200,12 @@ define amdgpu_ps <4 x half> @struct_ptr_buffer_load_format_v4f16__vpr_rsrc__sgpr ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; UNPACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>) from %ir.rsrc, align 1, addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub0 ; UNPACKED-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub1 @@ -249,7 +246,6 @@ define amdgpu_ps <4 x half> @struct_ptr_buffer_load_format_v4f16__vpr_rsrc__sgpr ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; PACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -276,14 +272,12 @@ define amdgpu_ps <4 x half> @struct_ptr_buffer_load_format_v4f16__vpr_rsrc__sgpr ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; PACKED-NEXT: [[BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>) from %ir.rsrc, align 1, addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub0 ; PACKED-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.ll index 1e3f94a5e39cb..6964a4d16445d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.ll @@ -118,7 +118,6 @@ define amdgpu_ps <4 x float> @struct_ptr_buffer_load_format_v4f32__vpr_rsrc__sgp ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -145,14 +144,12 @@ define amdgpu_ps <4 x float> @struct_ptr_buffer_load_format_v4f32__vpr_rsrc__sgp ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>) from %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub0 ; CHECK-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.ll index 66c62e9ce8a9c..eb85c33bfe352 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.ll @@ -188,7 +188,6 @@ define amdgpu_ps float @struct_ptr_buffer_load_f32__vgpr_rsrc__sgpr_vindex__sgpr ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -215,14 +214,12 @@ define amdgpu_ps float @struct_ptr_buffer_load_f32__vgpr_rsrc__sgpr_vindex__sgpr ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_BOTHEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.format.f16.ll index 25fe7d2877ce3..458fa61dc517c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.format.f16.ll @@ -153,7 +153,6 @@ define amdgpu_ps void @struct_ptr_buffer_store_format_f16__sgpr_val__vgpr_rsrc__ ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -180,14 +179,12 @@ define amdgpu_ps void @struct_ptr_buffer_store_format_f16__sgpr_val__vgpr_rsrc__ ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; UNPACKED-NEXT: BUFFER_STORE_FORMAT_D16_X_gfx80_BOTHEN_exact [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: S_ENDPGM 0 ; @@ -208,7 +205,6 @@ define amdgpu_ps void @struct_ptr_buffer_store_format_f16__sgpr_val__vgpr_rsrc__ ; PACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; PACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; PACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -235,14 +231,12 @@ define amdgpu_ps void @struct_ptr_buffer_store_format_f16__sgpr_val__vgpr_rsrc__ ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_X_BOTHEN_exact [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.struct.ptr.buffer.store.format.f16(half %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.format.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.format.f32.ll index 3a4c258537814..686d2fad9e8d4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.format.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.format.f32.ll @@ -112,7 +112,6 @@ define amdgpu_ps void @struct_ptr_buffer_store_format_f32__sgpr_val__vgpr_rsrc__ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -139,14 +138,12 @@ define amdgpu_ps void @struct_ptr_buffer_store_format_f32__sgpr_val__vgpr_rsrc__ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 ; CHECK-NEXT: BUFFER_STORE_FORMAT_X_BOTHEN_exact [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.struct.ptr.buffer.store.format.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.ll index 2e0a12b9d969c..d5ee650f197aa 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.ll @@ -117,7 +117,6 @@ define amdgpu_ps void @struct_ptr_buffer_store_v4f32_vgpr_rsrc__sgpr_val__sgpr_v ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]] ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -144,14 +143,12 @@ define amdgpu_ps void @struct_ptr_buffer_store_v4f32_vgpr_rsrc__sgpr_val__sgpr_v ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY13]], %subreg.sub1 ; CHECK-NEXT: BUFFER_STORE_DWORDX4_BOTHEN_exact [[COPY11]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>) into %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.f16.ll index 1a57c2e77bddf..b734fba79dd71 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.f16.ll @@ -215,7 +215,6 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__ ; PACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; PACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -242,14 +241,12 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__ ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; PACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>) from %ir.rsrc, align 1, addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub0 ; PACKED-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub1 @@ -272,7 +269,6 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__ ; UNPACKED-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -299,14 +295,12 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__ ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; UNPACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>) from %ir.rsrc, align 1, addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub0 ; UNPACKED-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.ll index 63143ed718054..3c639fa1c1b99 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.ll @@ -140,7 +140,6 @@ define amdgpu_ps <4 x float> @struct_tbuffer_load_v4f32__vgpr_rsrc__sgpr_vindex_ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -167,14 +166,12 @@ define amdgpu_ps <4 x float> @struct_tbuffer_load_v4f32__vgpr_rsrc__sgpr_vindex_ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>) from %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub0 ; CHECK-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll index f270f87aae66d..ce4e0dbb02906 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll @@ -288,7 +288,6 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__ ; PACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; PACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; PACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.2: ; PACKED-NEXT: successors: %bb.3(0x80000000) @@ -315,14 +314,12 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__ ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; PACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>), align 1, addrspace 8) - ; PACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; PACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; PACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; PACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.4: ; PACKED-NEXT: successors: %bb.5(0x80000000) ; PACKED-NEXT: {{ $}} - ; PACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; PACKED-NEXT: {{ $}} ; PACKED-NEXT: bb.5: ; PACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub0 ; PACKED-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub1 @@ -345,7 +342,6 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; GFX12-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.2: ; GFX12-NEXT: successors: %bb.3(0x80000000) @@ -372,14 +368,12 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__ ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; GFX12-NEXT: [[TBUFFER_LOAD_FORMAT_D16_XYZW_VBUFFER_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XYZW_VBUFFER_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>), align 1, addrspace 8) - ; GFX12-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: ; GFX12-NEXT: successors: %bb.5(0x80000000) ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_VBUFFER_BOTHEN]].sub0 ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_VBUFFER_BOTHEN]].sub1 @@ -402,7 +396,6 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__ ; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; UNPACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.2: ; UNPACKED-NEXT: successors: %bb.3(0x80000000) @@ -429,14 +422,12 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__ ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; UNPACKED-NEXT: [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>), align 1, addrspace 8) - ; UNPACKED-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; UNPACKED-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; UNPACKED-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; UNPACKED-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.4: ; UNPACKED-NEXT: successors: %bb.5(0x80000000) ; UNPACKED-NEXT: {{ $}} - ; UNPACKED-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; UNPACKED-NEXT: {{ $}} ; UNPACKED-NEXT: bb.5: ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub0 ; UNPACKED-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll index 7d3ecd363befb..955a092e435cb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll @@ -242,7 +242,6 @@ define amdgpu_ps <4 x float> @struct_tbuffer_load_v4f32__vgpr_rsrc__sgpr_vindex_ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) @@ -269,14 +268,12 @@ define amdgpu_ps <4 x float> @struct_tbuffer_load_v4f32__vgpr_rsrc__sgpr_vindex_ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub0 ; CHECK-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub1 @@ -303,7 +300,6 @@ define amdgpu_ps <4 x float> @struct_tbuffer_load_v4f32__vgpr_rsrc__sgpr_vindex_ ; CHECK-GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; CHECK-GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; CHECK-GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; CHECK-GFX12-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; CHECK-GFX12-NEXT: {{ $}} ; CHECK-GFX12-NEXT: bb.2: ; CHECK-GFX12-NEXT: successors: %bb.3(0x80000000) @@ -330,14 +326,12 @@ define amdgpu_ps <4 x float> @struct_tbuffer_load_v4f32__vgpr_rsrc__sgpr_vindex_ ; CHECK-GFX12-NEXT: {{ $}} ; CHECK-GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 ; CHECK-GFX12-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_VBUFFER_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_VBUFFER_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) - ; CHECK-GFX12-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-GFX12-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-GFX12-NEXT: {{ $}} ; CHECK-GFX12-NEXT: bb.4: ; CHECK-GFX12-NEXT: successors: %bb.5(0x80000000) ; CHECK-GFX12-NEXT: {{ $}} - ; CHECK-GFX12-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-GFX12-NEXT: {{ $}} ; CHECK-GFX12-NEXT: bb.5: ; CHECK-GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_VBUFFER_BOTHEN]].sub0 ; CHECK-GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_VBUFFER_BOTHEN]].sub1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll index a36b25ccfa48e..a883a542077bf 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll @@ -159,21 +159,23 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) { ; SI: ; %bb.0: ; %.entry ; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 ; SI-NEXT: v_cvt_i32_f32_e32 v1, v1 +; SI-NEXT: s_mov_b64 s[2:3], exec ; SI-NEXT: s_mov_b64 s[0:1], exec ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 1, v0 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; SI-NEXT: s_xor_b64 s[2:3], vcc, -1 -; SI-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] -; SI-NEXT: s_xor_b64 s[2:3], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB2_3 +; SI-NEXT: s_xor_b64 s[4:5], vcc, -1 +; SI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; SI-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; SI-NEXT: s_cmov_b64 exec, s[4:5] +; SI-NEXT: s_cbranch_scc0 .LBB2_3 ; SI-NEXT: ; %bb.1: ; %.demote -; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], exec ; SI-NEXT: s_cbranch_scc0 .LBB2_4 ; SI-NEXT: ; %bb.2: ; %.demote ; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: .LBB2_3: ; %.continue -; SI-NEXT: s_or_b64 exec, exec, s[2:3] ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc ; SI-NEXT: exp mrt1 v0, v0, v0, v0 done vm ; SI-NEXT: s_endpgm @@ -186,21 +188,23 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) { ; GFX9: ; %bb.0: ; %.entry ; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_xor_b64 s[2:3], vcc, -1 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB2_3 +; GFX9-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX9-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX9-NEXT: ; %bb.1: ; %.demote -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], exec ; GFX9-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX9-NEXT: ; %bb.2: ; %.demote ; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: .LBB2_3: ; %.continue -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc ; GFX9-NEXT: exp mrt1 v0, v0, v0, v0 done vm ; GFX9-NEXT: s_endpgm @@ -213,21 +217,23 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) { ; GFX10-32: ; %bb.0: ; %.entry ; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX10-32-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX10-32-NEXT: s_mov_b32 s1, exec_lo ; GFX10-32-NEXT: s_mov_b32 s0, exec_lo ; GFX10-32-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-32-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-32-NEXT: s_xor_b32 s1, vcc_lo, -1 -; GFX10-32-NEXT: s_and_saveexec_b32 s2, s1 -; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s2 -; GFX10-32-NEXT: s_cbranch_execz .LBB2_3 +; GFX10-32-NEXT: s_xor_b32 s2, vcc_lo, -1 +; GFX10-32-NEXT: s_and_b32 s2, s2, exec_lo +; GFX10-32-NEXT: s_and_b32 s3, s2, -1 +; GFX10-32-NEXT: s_cmov_b32 exec_lo, s2 +; GFX10-32-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX10-32-NEXT: ; %bb.1: ; %.demote -; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo +; GFX10-32-NEXT: s_andn2_b32 s1, s1, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX10-32-NEXT: ; %bb.2: ; %.demote ; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX10-32-NEXT: .LBB2_3: ; %.continue -; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo ; GFX10-32-NEXT: exp mrt1 v0, v0, v0, v0 done vm ; GFX10-32-NEXT: s_endpgm @@ -240,21 +246,23 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) { ; GFX10-64: ; %bb.0: ; %.entry ; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX10-64-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX10-64-NEXT: s_mov_b64 s[2:3], exec ; GFX10-64-NEXT: s_mov_b64 s[0:1], exec ; GFX10-64-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-64-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10-64-NEXT: s_xor_b64 s[2:3], vcc, -1 -; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] -; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[4:5] -; GFX10-64-NEXT: s_cbranch_execz .LBB2_3 +; GFX10-64-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GFX10-64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10-64-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX10-64-NEXT: s_cmov_b64 exec, s[4:5] +; GFX10-64-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote -; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; GFX10-64-NEXT: s_andn2_b64 s[2:3], s[2:3], exec ; GFX10-64-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX10-64-NEXT: ; %bb.2: ; %.demote ; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX10-64-NEXT: .LBB2_3: ; %.continue -; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc ; GFX10-64-NEXT: exp mrt1 v0, v0, v0, v0 done vm ; GFX10-64-NEXT: s_endpgm @@ -286,17 +294,18 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre ; SI-NEXT: s_mov_b64 s[12:13], exec ; SI-NEXT: s_wqm_b64 exec, exec ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 -; SI-NEXT: s_and_saveexec_b64 s[14:15], vcc -; SI-NEXT: s_xor_b64 s[14:15], exec, s[14:15] -; SI-NEXT: s_cbranch_execz .LBB3_3 +; SI-NEXT: s_mov_b64 s[14:15], exec +; SI-NEXT: s_and_b64 s[16:17], vcc, -1 +; SI-NEXT: s_cmov_b64 exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB3_3 ; SI-NEXT: ; %bb.1: ; %.demote ; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; SI-NEXT: s_cbranch_scc0 .LBB3_4 ; SI-NEXT: ; %bb.2: ; %.demote ; SI-NEXT: s_wqm_b64 s[16:17], s[12:13] ; SI-NEXT: s_and_b64 exec, exec, s[16:17] -; SI-NEXT: .LBB3_3: ; %.continue ; SI-NEXT: s_or_b64 exec, exec, s[14:15] +; SI-NEXT: .LBB3_3: ; %.continue ; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f32_e32 v0, v0, v0 @@ -315,17 +324,18 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 -; GFX9-NEXT: s_and_saveexec_b64 s[14:15], vcc -; GFX9-NEXT: s_xor_b64 s[14:15], exec, s[14:15] -; GFX9-NEXT: s_cbranch_execz .LBB3_3 +; GFX9-NEXT: s_mov_b64 s[14:15], exec +; GFX9-NEXT: s_and_b64 s[16:17], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB3_3 ; GFX9-NEXT: ; %bb.1: ; %.demote ; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; GFX9-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX9-NEXT: ; %bb.2: ; %.demote ; GFX9-NEXT: s_wqm_b64 s[16:17], s[12:13] ; GFX9-NEXT: s_and_b64 exec, exec, s[16:17] -; GFX9-NEXT: .LBB3_3: ; %.continue ; GFX9-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX9-NEXT: .LBB3_3: ; %.continue ; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 @@ -344,17 +354,18 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-32-NEXT: s_mov_b32 s12, exec_lo ; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v1 -; GFX10-32-NEXT: s_and_saveexec_b32 s13, vcc_lo -; GFX10-32-NEXT: s_xor_b32 s13, exec_lo, s13 -; GFX10-32-NEXT: s_cbranch_execz .LBB3_3 +; GFX10-32-NEXT: s_mov_b32 s13, exec_lo +; GFX10-32-NEXT: s_and_b32 s14, vcc_lo, -1 +; GFX10-32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-32-NEXT: s_cbranch_scc0 .LBB3_3 ; GFX10-32-NEXT: ; %bb.1: ; %.demote ; GFX10-32-NEXT: s_andn2_b32 s12, s12, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX10-32-NEXT: ; %bb.2: ; %.demote ; GFX10-32-NEXT: s_wqm_b32 s14, s12 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s14 -; GFX10-32-NEXT: .LBB3_3: ; %.continue ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13 +; GFX10-32-NEXT: .LBB3_3: ; %.continue ; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-32-NEXT: s_waitcnt vmcnt(0) ; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 @@ -373,17 +384,18 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-64-NEXT: s_mov_b64 s[12:13], exec ; GFX10-64-NEXT: s_wqm_b64 exec, exec ; GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 -; GFX10-64-NEXT: s_and_saveexec_b64 s[14:15], vcc -; GFX10-64-NEXT: s_xor_b64 s[14:15], exec, s[14:15] -; GFX10-64-NEXT: s_cbranch_execz .LBB3_3 +; GFX10-64-NEXT: s_mov_b64 s[14:15], exec +; GFX10-64-NEXT: s_and_b64 s[16:17], vcc, -1 +; GFX10-64-NEXT: s_cmov_b64 exec, vcc +; GFX10-64-NEXT: s_cbranch_scc0 .LBB3_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote ; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; GFX10-64-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX10-64-NEXT: ; %bb.2: ; %.demote ; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17] -; GFX10-64-NEXT: .LBB3_3: ; %.continue ; GFX10-64-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX10-64-NEXT: .LBB3_3: ; %.continue ; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-64-NEXT: s_waitcnt vmcnt(0) ; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 @@ -420,19 +432,20 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre ; SI-NEXT: s_mov_b64 s[12:13], exec ; SI-NEXT: s_wqm_b64 exec, exec ; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; SI-NEXT: s_mov_b64 s[14:15], exec ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 -; SI-NEXT: s_and_saveexec_b64 s[14:15], vcc -; SI-NEXT: s_xor_b64 s[14:15], exec, s[14:15] -; SI-NEXT: s_cbranch_execz .LBB4_3 +; SI-NEXT: s_and_b64 s[16:17], vcc, -1 +; SI-NEXT: s_cmov_b64 exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB4_3 ; SI-NEXT: ; %bb.1: ; %.demote ; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; SI-NEXT: s_cbranch_scc0 .LBB4_4 ; SI-NEXT: ; %bb.2: ; %.demote ; SI-NEXT: s_wqm_b64 s[16:17], s[12:13] ; SI-NEXT: s_and_b64 exec, exec, s[16:17] -; SI-NEXT: .LBB4_3: ; %.continue ; SI-NEXT: s_or_b64 exec, exec, s[14:15] +; SI-NEXT: .LBB4_3: ; %.continue ; SI-NEXT: v_add_f32_e32 v0, v0, v0 ; SI-NEXT: s_and_b64 exec, exec, s[12:13] ; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf @@ -449,19 +462,20 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[14:15], vcc -; GFX9-NEXT: s_xor_b64 s[14:15], exec, s[14:15] -; GFX9-NEXT: s_cbranch_execz .LBB4_3 +; GFX9-NEXT: s_and_b64 s[16:17], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX9-NEXT: ; %bb.1: ; %.demote ; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; GFX9-NEXT: s_cbranch_scc0 .LBB4_4 ; GFX9-NEXT: ; %bb.2: ; %.demote ; GFX9-NEXT: s_wqm_b64 s[16:17], s[12:13] ; GFX9-NEXT: s_and_b64 exec, exec, s[16:17] -; GFX9-NEXT: .LBB4_3: ; %.continue ; GFX9-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX9-NEXT: .LBB4_3: ; %.continue ; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf @@ -478,19 +492,20 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-32-NEXT: s_mov_b32 s12, exec_lo ; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: s_mov_b32 s13, exec_lo ; GFX10-32-NEXT: s_waitcnt vmcnt(0) ; GFX10-32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v0 -; GFX10-32-NEXT: s_and_saveexec_b32 s13, vcc_lo -; GFX10-32-NEXT: s_xor_b32 s13, exec_lo, s13 -; GFX10-32-NEXT: s_cbranch_execz .LBB4_3 +; GFX10-32-NEXT: s_and_b32 s14, vcc_lo, -1 +; GFX10-32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-32-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX10-32-NEXT: ; %bb.1: ; %.demote ; GFX10-32-NEXT: s_andn2_b32 s12, s12, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 .LBB4_4 ; GFX10-32-NEXT: ; %bb.2: ; %.demote ; GFX10-32-NEXT: s_wqm_b32 s14, s12 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s14 -; GFX10-32-NEXT: .LBB4_3: ; %.continue ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13 +; GFX10-32-NEXT: .LBB4_3: ; %.continue ; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D @@ -507,19 +522,20 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-64-NEXT: s_mov_b64 s[12:13], exec ; GFX10-64-NEXT: s_wqm_b64 exec, exec ; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_mov_b64 s[14:15], exec ; GFX10-64-NEXT: s_waitcnt vmcnt(0) ; GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 -; GFX10-64-NEXT: s_and_saveexec_b64 s[14:15], vcc -; GFX10-64-NEXT: s_xor_b64 s[14:15], exec, s[14:15] -; GFX10-64-NEXT: s_cbranch_execz .LBB4_3 +; GFX10-64-NEXT: s_and_b64 s[16:17], vcc, -1 +; GFX10-64-NEXT: s_cmov_b64 exec, vcc +; GFX10-64-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote ; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; GFX10-64-NEXT: s_cbranch_scc0 .LBB4_4 ; GFX10-64-NEXT: ; %bb.2: ; %.demote ; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17] -; GFX10-64-NEXT: .LBB4_3: ; %.continue ; GFX10-64-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX10-64-NEXT: .LBB4_3: ; %.continue ; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 ; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D @@ -663,39 +679,42 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { ; SI-NEXT: s_wqm_b64 exec, exec ; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc -; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; SI-NEXT: s_cbranch_execz .LBB6_3 +; SI-NEXT: s_xor_b64 s[2:3], vcc, exec +; SI-NEXT: s_and_b64 s[4:5], vcc, -1 +; SI-NEXT: s_cmov_b64 exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB6_3 ; SI-NEXT: ; %bb.1: ; %.demote0 ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; SI-NEXT: s_cbranch_scc0 .LBB6_7 ; SI-NEXT: ; %bb.2: ; %.demote0 ; SI-NEXT: s_wqm_b64 s[4:5], s[0:1] ; SI-NEXT: s_and_b64 exec, exec, s[4:5] -; SI-NEXT: .LBB6_3: ; %.continue0 ; SI-NEXT: s_or_b64 exec, exec, s[2:3] -; SI-NEXT: s_mov_b64 s[2:3], s[0:1] -; SI-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] +; SI-NEXT: .LBB6_3: ; %.continue0 +; SI-NEXT: s_mov_b64 s[4:5], s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[4:5] ; SI-NEXT: v_mov_b32_e32 v1, v0 -; SI-NEXT: s_nop 1 +; SI-NEXT: s_mov_b64 s[2:3], exec +; SI-NEXT: s_nop 0 ; SI-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; SI-NEXT: s_nop 1 ; SI-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; SI-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; SI-NEXT: s_and_b64 exec, exec, s[0:1] ; SI-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 -; SI-NEXT: s_and_b64 s[2:3], s[0:1], vcc -; SI-NEXT: s_xor_b64 s[2:3], s[2:3], -1 -; SI-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] -; SI-NEXT: s_xor_b64 s[2:3], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB6_6 +; SI-NEXT: s_and_b64 s[4:5], s[0:1], vcc +; SI-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; SI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; SI-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; SI-NEXT: s_cmov_b64 exec, s[4:5] +; SI-NEXT: s_cbranch_scc0 .LBB6_6 ; SI-NEXT: ; %bb.4: ; %.demote1 ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; SI-NEXT: s_cbranch_scc0 .LBB6_7 ; SI-NEXT: ; %bb.5: ; %.demote1 ; SI-NEXT: s_mov_b64 exec, 0 -; SI-NEXT: .LBB6_6: ; %.continue1 ; SI-NEXT: s_or_b64 exec, exec, s[2:3] +; SI-NEXT: .LBB6_6: ; %.continue1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3c00 ; SI-NEXT: v_bfrev_b32_e32 v1, 60 ; SI-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm @@ -711,39 +730,42 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { ; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX9-NEXT: s_cbranch_execz .LBB6_3 +; GFX9-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX9-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX9-NEXT: ; %bb.1: ; %.demote0 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cbranch_scc0 .LBB6_7 ; GFX9-NEXT: ; %bb.2: ; %.demote0 ; GFX9-NEXT: s_wqm_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_and_b64 exec, exec, s[4:5] -; GFX9-NEXT: .LBB6_3: ; %.continue0 ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_mov_b64 s[2:3], s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] +; GFX9-NEXT: .LBB6_3: ; %.continue0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; GFX9-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_b64 s[2:3], s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], -1 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB6_6 +; GFX9-NEXT: s_and_b64 s[4:5], s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX9-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9-NEXT: s_cbranch_scc0 .LBB6_6 ; GFX9-NEXT: ; %bb.4: ; %.demote1 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cbranch_scc0 .LBB6_7 ; GFX9-NEXT: ; %bb.5: ; %.demote1 ; GFX9-NEXT: s_mov_b64 exec, 0 -; GFX9-NEXT: .LBB6_6: ; %.continue1 ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB6_6: ; %.continue1 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 60 ; GFX9-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm @@ -759,37 +781,40 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { ; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX10-32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX10-32-NEXT: s_cbranch_execz .LBB6_3 +; GFX10-32-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX10-32-NEXT: s_and_b32 s2, vcc_lo, -1 +; GFX10-32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-32-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX10-32-NEXT: ; %bb.1: ; %.demote0 ; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 .LBB6_7 ; GFX10-32-NEXT: ; %bb.2: ; %.demote0 ; GFX10-32-NEXT: s_wqm_b32 s2, s0 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s2 -; GFX10-32-NEXT: .LBB6_3: ; %.continue0 ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10-32-NEXT: s_mov_b32 s1, s0 -; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s1 +; GFX10-32-NEXT: .LBB6_3: ; %.continue0 +; GFX10-32-NEXT: s_mov_b32 s2, s0 +; GFX10-32-NEXT: s_mov_b32 s1, exec_lo +; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s2 ; GFX10-32-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-32-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-32-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0 ; GFX10-32-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0 -; GFX10-32-NEXT: s_and_b32 s1, s0, vcc_lo -; GFX10-32-NEXT: s_xor_b32 s1, s1, -1 -; GFX10-32-NEXT: s_and_saveexec_b32 s2, s1 -; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s2 -; GFX10-32-NEXT: s_cbranch_execz .LBB6_6 +; GFX10-32-NEXT: s_and_b32 s2, s0, vcc_lo +; GFX10-32-NEXT: s_xor_b32 s2, s2, -1 +; GFX10-32-NEXT: s_and_b32 s2, s2, exec_lo +; GFX10-32-NEXT: s_and_b32 s3, s2, -1 +; GFX10-32-NEXT: s_cmov_b32 exec_lo, s2 +; GFX10-32-NEXT: s_cbranch_scc0 .LBB6_6 ; GFX10-32-NEXT: ; %bb.4: ; %.demote1 ; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 .LBB6_7 ; GFX10-32-NEXT: ; %bb.5: ; %.demote1 ; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 -; GFX10-32-NEXT: .LBB6_6: ; %.continue1 ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-32-NEXT: .LBB6_6: ; %.continue1 ; GFX10-32-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX10-32-NEXT: v_bfrev_b32_e32 v1, 60 ; GFX10-32-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm @@ -805,37 +830,40 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { ; GFX10-64-NEXT: s_wqm_b64 exec, exec ; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX10-64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX10-64-NEXT: s_cbranch_execz .LBB6_3 +; GFX10-64-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX10-64-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX10-64-NEXT: s_cmov_b64 exec, vcc +; GFX10-64-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote0 ; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX10-64-NEXT: s_cbranch_scc0 .LBB6_7 ; GFX10-64-NEXT: ; %bb.2: ; %.demote0 ; GFX10-64-NEXT: s_wqm_b64 s[4:5], s[0:1] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[4:5] -; GFX10-64-NEXT: .LBB6_3: ; %.continue0 ; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX10-64-NEXT: s_mov_b64 s[2:3], s[0:1] -; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] +; GFX10-64-NEXT: .LBB6_3: ; %.continue0 +; GFX10-64-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX10-64-NEXT: s_mov_b64 s[2:3], exec +; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[4:5] ; GFX10-64-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-64-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-64-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX10-64-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 -; GFX10-64-NEXT: s_and_b64 s[2:3], s[0:1], vcc -; GFX10-64-NEXT: s_xor_b64 s[2:3], s[2:3], -1 -; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] -; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[4:5] -; GFX10-64-NEXT: s_cbranch_execz .LBB6_6 +; GFX10-64-NEXT: s_and_b64 s[4:5], s[0:1], vcc +; GFX10-64-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GFX10-64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10-64-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX10-64-NEXT: s_cmov_b64 exec, s[4:5] +; GFX10-64-NEXT: s_cbranch_scc0 .LBB6_6 ; GFX10-64-NEXT: ; %bb.4: ; %.demote1 ; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX10-64-NEXT: s_cbranch_scc0 .LBB6_7 ; GFX10-64-NEXT: ; %bb.5: ; %.demote1 ; GFX10-64-NEXT: s_mov_b64 exec, 0 -; GFX10-64-NEXT: .LBB6_6: ; %.continue1 ; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10-64-NEXT: .LBB6_6: ; %.continue1 ; GFX10-64-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX10-64-NEXT: v_bfrev_b32_e32 v1, 60 ; GFX10-64-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm @@ -885,46 +913,50 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; SI-NEXT: s_mov_b64 s[0:1], exec ; SI-NEXT: s_wqm_b64 exec, exec ; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 -; SI-NEXT: s_mov_b32 s4, 0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc -; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; SI-NEXT: s_cbranch_execz .LBB7_3 +; SI-NEXT: s_xor_b64 s[2:3], vcc, exec +; SI-NEXT: s_and_b64 s[4:5], vcc, -1 +; SI-NEXT: s_mov_b32 s4, 0 +; SI-NEXT: s_cmov_b64 exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB7_3 ; SI-NEXT: ; %bb.1: ; %.demote0 ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; SI-NEXT: s_cbranch_scc0 .LBB7_9 ; SI-NEXT: ; %bb.2: ; %.demote0 ; SI-NEXT: s_wqm_b64 s[6:7], s[0:1] ; SI-NEXT: s_and_b64 exec, exec, s[6:7] -; SI-NEXT: .LBB7_3: ; %.continue0.preheader ; SI-NEXT: s_or_b64 exec, exec, s[2:3] +; SI-NEXT: .LBB7_3: ; %.continue0.preheader ; SI-NEXT: s_mov_b64 s[2:3], 0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_branch .LBB7_5 ; SI-NEXT: .LBB7_4: ; %.continue1 ; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_add_u32_e32 v0, vcc, 1, v0 ; SI-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1 ; SI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; SI-NEXT: s_andn2_b64 exec, exec, s[2:3] -; SI-NEXT: s_cbranch_execz .LBB7_8 +; SI-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; SI-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; SI-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; SI-NEXT: s_cbranch_scc0 .LBB7_8 ; SI-NEXT: .LBB7_5: ; %.continue0 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: s_mov_b64 s[4:5], s[0:1] -; SI-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[4:5] +; SI-NEXT: s_mov_b64 s[6:7], s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[6:7] ; SI-NEXT: v_mov_b32_e32 v3, v2 -; SI-NEXT: s_nop 1 +; SI-NEXT: s_mov_b64 s[4:5], exec +; SI-NEXT: s_nop 0 ; SI-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; SI-NEXT: s_nop 1 ; SI-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; SI-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec ; SI-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; SI-NEXT: s_and_b64 s[4:5], s[0:1], vcc -; SI-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; SI-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; SI-NEXT: s_xor_b64 s[4:5], exec, s[6:7] -; SI-NEXT: s_cbranch_execz .LBB7_4 +; SI-NEXT: s_and_b64 s[6:7], s[0:1], vcc +; SI-NEXT: s_xor_b64 s[6:7], s[6:7], -1 +; SI-NEXT: s_and_b64 s[6:7], s[6:7], exec +; SI-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; SI-NEXT: s_cmov_b64 exec, s[6:7] +; SI-NEXT: s_cbranch_scc0 .LBB7_4 ; SI-NEXT: ; %bb.6: ; %.demote1 ; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec @@ -933,9 +965,9 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 ; SI-NEXT: s_wqm_b64 s[6:7], s[0:1] ; SI-NEXT: s_and_b64 exec, exec, s[6:7] +; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_branch .LBB7_4 ; SI-NEXT: .LBB7_8: ; %.return -; SI-NEXT: s_or_b64 exec, exec, s[2:3] ; SI-NEXT: s_and_b64 exec, exec, s[0:1] ; SI-NEXT: v_mov_b32_e32 v0, 0x3c00 ; SI-NEXT: v_bfrev_b32_e32 v1, 60 @@ -951,46 +983,50 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX9-NEXT: s_cbranch_execz .LBB7_3 +; GFX9-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX9-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX9-NEXT: ; %bb.1: ; %.demote0 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cbranch_scc0 .LBB7_9 ; GFX9-NEXT: ; %bb.2: ; %.demote0 ; GFX9-NEXT: s_wqm_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_and_b64 exec, exec, s[6:7] -; GFX9-NEXT: .LBB7_3: ; %.continue0.preheader ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB7_3: ; %.continue0.preheader ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_branch .LBB7_5 ; GFX9-NEXT: .LBB7_4: ; %.continue1 ; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_add_u32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execz .LBB7_8 +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc0 .LBB7_8 ; GFX9-NEXT: .LBB7_5: ; %.continue0 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[4:5] +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v3, v2 -; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec ; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX9-NEXT: s_and_b64 s[4:5], s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GFX9-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[6:7] -; GFX9-NEXT: s_cbranch_execz .LBB7_4 +; GFX9-NEXT: s_and_b64 s[6:7], s[0:1], vcc +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], -1 +; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX9-NEXT: s_cmov_b64 exec, s[6:7] +; GFX9-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX9-NEXT: ; %bb.6: ; %.demote1 ; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec @@ -999,9 +1035,9 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1 ; GFX9-NEXT: s_wqm_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_and_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_branch .LBB7_4 ; GFX9-NEXT: .LBB7_8: ; %.return -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 60 @@ -1019,41 +1055,45 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX10-32-NEXT: s_mov_b32 s1, 0 ; GFX10-32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10-32-NEXT: s_xor_b32 s2, exec_lo, s2 -; GFX10-32-NEXT: s_cbranch_execz .LBB7_3 +; GFX10-32-NEXT: s_xor_b32 s2, vcc_lo, exec_lo +; GFX10-32-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX10-32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-32-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX10-32-NEXT: ; %bb.1: ; %.demote0 ; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 .LBB7_9 ; GFX10-32-NEXT: ; %bb.2: ; %.demote0 ; GFX10-32-NEXT: s_wqm_b32 s3, s0 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s3 -; GFX10-32-NEXT: .LBB7_3: ; %.continue0.preheader ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-32-NEXT: .LBB7_3: ; %.continue0.preheader ; GFX10-32-NEXT: v_mov_b32_e32 v0, s1 ; GFX10-32-NEXT: s_branch .LBB7_5 ; GFX10-32-NEXT: .LBB7_4: ; %.continue1 ; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-32-NEXT: v_add_nc_u32_e32 v0, 1, v0 ; GFX10-32-NEXT: v_cmp_ge_i32_e32 vcc_lo, v0, v1 ; GFX10-32-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX10-32-NEXT: s_andn2_b32 exec_lo, exec_lo, s1 -; GFX10-32-NEXT: s_cbranch_execz .LBB7_8 +; GFX10-32-NEXT: s_andn2_b32 s2, exec_lo, s1 +; GFX10-32-NEXT: s_and_b32 s3, s2, -1 +; GFX10-32-NEXT: s_cselect_b32 exec_lo, s2, s1 +; GFX10-32-NEXT: s_cbranch_scc0 .LBB7_8 ; GFX10-32-NEXT: .LBB7_5: ; %.continue0 ; GFX10-32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-32-NEXT: s_mov_b32 s2, s0 -; GFX10-32-NEXT: v_cndmask_b32_e64 v2, v0, 0, s2 +; GFX10-32-NEXT: s_mov_b32 s3, s0 +; GFX10-32-NEXT: s_mov_b32 s2, exec_lo +; GFX10-32-NEXT: v_cndmask_b32_e64 v2, v0, 0, s3 ; GFX10-32-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-32-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-32-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-32-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec ; GFX10-32-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 -; GFX10-32-NEXT: s_and_b32 s2, s0, vcc_lo -; GFX10-32-NEXT: s_xor_b32 s2, s2, -1 -; GFX10-32-NEXT: s_and_saveexec_b32 s3, s2 -; GFX10-32-NEXT: s_xor_b32 s2, exec_lo, s3 -; GFX10-32-NEXT: s_cbranch_execz .LBB7_4 +; GFX10-32-NEXT: s_and_b32 s3, s0, vcc_lo +; GFX10-32-NEXT: s_xor_b32 s3, s3, -1 +; GFX10-32-NEXT: s_and_b32 s3, s3, exec_lo +; GFX10-32-NEXT: s_and_b32 s4, s3, -1 +; GFX10-32-NEXT: s_cmov_b32 exec_lo, s3 +; GFX10-32-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX10-32-NEXT: ; %bb.6: ; %.demote1 ; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 ; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo @@ -1062,9 +1102,9 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 ; GFX10-32-NEXT: s_wqm_b32 s3, s0 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s3 +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-32-NEXT: s_branch .LBB7_4 ; GFX10-32-NEXT: .LBB7_8: ; %.return -; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0 ; GFX10-32-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX10-32-NEXT: v_bfrev_b32_e32 v1, 60 @@ -1082,42 +1122,46 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX10-64-NEXT: s_mov_b32 s4, 0 ; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX10-64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX10-64-NEXT: s_cbranch_execz .LBB7_3 +; GFX10-64-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX10-64-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX10-64-NEXT: s_cmov_b64 exec, vcc +; GFX10-64-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote0 ; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX10-64-NEXT: s_cbranch_scc0 .LBB7_9 ; GFX10-64-NEXT: ; %bb.2: ; %.demote0 ; GFX10-64-NEXT: s_wqm_b64 s[6:7], s[0:1] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[6:7] -; GFX10-64-NEXT: .LBB7_3: ; %.continue0.preheader ; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10-64-NEXT: .LBB7_3: ; %.continue0.preheader ; GFX10-64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-64-NEXT: s_mov_b64 s[2:3], 0 ; GFX10-64-NEXT: s_branch .LBB7_5 ; GFX10-64-NEXT: .LBB7_4: ; %.continue1 ; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX10-64-NEXT: v_add_nc_u32_e32 v0, 1, v0 ; GFX10-64-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1 ; GFX10-64-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX10-64-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX10-64-NEXT: s_cbranch_execz .LBB7_8 +; GFX10-64-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX10-64-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX10-64-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX10-64-NEXT: s_cbranch_scc0 .LBB7_8 ; GFX10-64-NEXT: .LBB7_5: ; %.continue0 ; GFX10-64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-64-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX10-64-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[4:5] +; GFX10-64-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX10-64-NEXT: s_mov_b64 s[4:5], exec +; GFX10-64-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[6:7] ; GFX10-64-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-64-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-64-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-64-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec ; GFX10-64-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX10-64-NEXT: s_and_b64 s[4:5], s[0:1], vcc -; GFX10-64-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GFX10-64-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GFX10-64-NEXT: s_xor_b64 s[4:5], exec, s[6:7] -; GFX10-64-NEXT: s_cbranch_execz .LBB7_4 +; GFX10-64-NEXT: s_and_b64 s[6:7], s[0:1], vcc +; GFX10-64-NEXT: s_xor_b64 s[6:7], s[6:7], -1 +; GFX10-64-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GFX10-64-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX10-64-NEXT: s_cmov_b64 exec, s[6:7] +; GFX10-64-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX10-64-NEXT: ; %bb.6: ; %.demote1 ; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1 ; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec @@ -1126,9 +1170,9 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1 ; GFX10-64-NEXT: s_wqm_b64 s[6:7], s[0:1] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[6:7] +; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX10-64-NEXT: s_branch .LBB7_4 ; GFX10-64-NEXT: .LBB7_8: ; %.return -; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX10-64-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX10-64-NEXT: v_bfrev_b32_e32 v1, 60 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll index 4d4da869d7507..895d3e5f4c1ce 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll @@ -8,9 +8,10 @@ define amdgpu_cs void @memmove_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src ; LOOP-LABEL: memmove_p1i8: ; LOOP: ; %bb.0: ; LOOP-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[0:1] -; LOOP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; LOOP-NEXT: s_xor_b64 s[4:5], exec, s[0:1] -; LOOP-NEXT: s_cbranch_execz .LBB0_3 +; LOOP-NEXT: s_xor_b64 s[4:5], vcc, exec +; LOOP-NEXT: s_and_b64 s[0:1], vcc, -1 +; LOOP-NEXT: s_cmov_b64 exec, vcc +; LOOP-NEXT: s_cbranch_scc0 .LBB0_4 ; LOOP-NEXT: ; %bb.1: ; %copy_forward ; LOOP-NEXT: s_mov_b64 s[6:7], 0 ; LOOP-NEXT: s_mov_b32 s2, 0 @@ -32,10 +33,16 @@ define amdgpu_cs void @memmove_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src ; LOOP-NEXT: s_waitcnt vmcnt(0) ; LOOP-NEXT: buffer_store_byte v8, v[6:7], s[0:3], 0 addr64 ; LOOP-NEXT: s_cbranch_vccnz .LBB0_2 -; LOOP-NEXT: .LBB0_3: ; %Flow17 -; LOOP-NEXT: s_andn2_saveexec_b64 s[0:1], s[4:5] -; LOOP-NEXT: s_cbranch_execz .LBB0_6 -; LOOP-NEXT: ; %bb.4: ; %copy_backwards +; LOOP-NEXT: ; %bb.3: ; %Flow +; LOOP-NEXT: ; implicit-def: $vgpr0 +; LOOP-NEXT: ; implicit-def: $vgpr2 +; LOOP-NEXT: s_or_b64 exec, exec, s[4:5] +; LOOP-NEXT: .LBB0_4: ; %Flow17 +; LOOP-NEXT: s_xor_b64 s[0:1], s[4:5], exec +; LOOP-NEXT: s_and_b64 s[0:1], s[4:5], -1 +; LOOP-NEXT: s_cmov_b64 exec, s[4:5] +; LOOP-NEXT: s_cbranch_scc0 .LBB0_7 +; LOOP-NEXT: ; %bb.5: ; %copy_backwards ; LOOP-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; LOOP-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; LOOP-NEXT: v_add_i32_e32 v2, vcc, 3, v2 @@ -45,7 +52,7 @@ define amdgpu_cs void @memmove_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src ; LOOP-NEXT: s_mov_b32 s7, 0xf000 ; LOOP-NEXT: s_mov_b64 s[4:5], 0 ; LOOP-NEXT: v_mov_b32_e32 v4, s0 -; LOOP-NEXT: .LBB0_5: ; %copy_backwards_loop +; LOOP-NEXT: .LBB0_6: ; %copy_backwards_loop ; LOOP-NEXT: ; =>This Inner Loop Header: Depth=1 ; LOOP-NEXT: s_waitcnt expcnt(0) ; LOOP-NEXT: buffer_load_ubyte v5, v[2:3], s[4:7], 0 addr64 @@ -57,8 +64,8 @@ define amdgpu_cs void @memmove_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src ; LOOP-NEXT: v_addc_u32_e64 v1, s[0:1], -1, v1, s[0:1] ; LOOP-NEXT: v_add_i32_e64 v2, s[0:1], -1, v2 ; LOOP-NEXT: v_addc_u32_e64 v3, s[0:1], -1, v3, s[0:1] -; LOOP-NEXT: s_cbranch_vccz .LBB0_5 -; LOOP-NEXT: .LBB0_6: ; %memmove_done +; LOOP-NEXT: s_cbranch_vccz .LBB0_6 +; LOOP-NEXT: .LBB0_7: ; %memmove_done ; LOOP-NEXT: s_endpgm ; ; UNROLL-LABEL: memmove_p1i8: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll index 36bac87889cac..00972898d5458 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll @@ -171,16 +171,12 @@ define void @localize_internal_globals(i1 %cond) { ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GFX9-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[6:7] -; GFX9-NEXT: s_cbranch_execnz .LBB2_3 -; GFX9-NEXT: ; %bb.1: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB2_4 -; GFX9-NEXT: .LBB2_2: ; %bb2 -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB2_3: ; %bb1 +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], exec +; GFX9-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX9-NEXT: s_cmov_b64 exec, s[6:7] +; GFX9-NEXT: s_cbranch_scc0 .LBB2_2 +; GFX9-NEXT: ; %bb.1: ; %bb1 ; GFX9-NEXT: s_getpc_b64 s[6:7] ; GFX9-NEXT: s_add_u32 s6, s6, static.gv2@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s7, s7, static.gv2@rel32@hi+12 @@ -193,22 +189,27 @@ define void @localize_internal_globals(i1 %cond) { ; GFX9-NEXT: v_mov_b32_e32 v1, 1 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB2_2 -; GFX9-NEXT: .LBB2_4: ; %bb0 -; GFX9-NEXT: s_getpc_b64 s[6:7] -; GFX9-NEXT: s_add_u32 s6, s6, static.gv0@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s7, s7, static.gv0@rel32@hi+12 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB2_2: ; %Flow +; GFX9-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX9-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GFX9-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9-NEXT: s_cbranch_scc0 .LBB2_4 +; GFX9-NEXT: ; %bb.3: ; %bb0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, static.gv0@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, static.gv0@rel32@hi+12 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_store_dword v0, v0, s[6:7] +; GFX9-NEXT: global_store_dword v0, v0, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_getpc_b64 s[6:7] -; GFX9-NEXT: s_add_u32 s6, s6, static.gv1@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s7, s7, static.gv1@rel32@hi+12 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, static.gv1@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, static.gv1@rel32@hi+12 ; GFX9-NEXT: v_mov_b32_e32 v1, 1 -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: .LBB2_4: ; %bb2 ; GFX9-NEXT: s_setpc_b64 s[30:31] entry: br i1 %cond, label %bb0, label %bb1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll index 1140ef88ac7f8..5e5fd009c2a86 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll @@ -508,24 +508,28 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1) ; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_cmp_ge_u64_e32 vcc_lo, 0, v[2:3] -; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX10-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX10-NEXT: s_cbranch_execz .LBB10_2 +; GFX10-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX10-NEXT: s_and_b32 s1, vcc_lo, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB10_2 ; GFX10-NEXT: ; %bb.1: ; %else ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s1, v2, v4, 0 ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s1, v2, v5, v[1:2] ; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX10-NEXT: .LBB10_2: ; %Flow -; GFX10-NEXT: s_andn2_saveexec_b32 s0, s0 -; GFX10-NEXT: s_cbranch_execz .LBB10_4 +; GFX10-NEXT: s_xor_b32 s1, s0, exec_lo +; GFX10-NEXT: s_and_b32 s2, s0, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, s0 +; GFX10-NEXT: s_cbranch_scc0 .LBB10_4 ; GFX10-NEXT: ; %bb.3: ; %if ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_lo_u32 v1, v2, v5 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX10-NEXT: .LBB10_4: ; %endif -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm @@ -540,12 +544,13 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b64 v[2:3], v0, s[6:7] ; GFX11-NEXT: global_load_b64 v[4:5], v0, s[0:1] -; GFX11-NEXT: s_mov_b32 s0, exec_lo ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cmpx_ge_u64_e32 0, v[2:3] -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB10_2 +; GFX11-NEXT: v_cmp_ge_u64_e32 vcc_lo, 0, v[2:3] +; GFX11-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX11-NEXT: s_and_b32 s1, vcc_lo, -1 +; GFX11-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB10_2 ; GFX11-NEXT: ; %bb.1: ; %else ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v4, 0 @@ -554,15 +559,19 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1) ; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX11-NEXT: v_mov_b32_e32 v1, v3 ; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: .LBB10_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB10_4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s1, s0, exec_lo +; GFX11-NEXT: s_and_b32 s2, s0, -1 +; GFX11-NEXT: s_cmov_b32 exec_lo, s0 +; GFX11-NEXT: s_cbranch_scc0 .LBB10_4 ; GFX11-NEXT: ; %bb.3: ; %if ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mul_lo_u32 v1, v2, v5 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: .LBB10_4: ; %endif -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll index eaaeb3dc77a41..528110d2e6ae2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll @@ -148,37 +148,43 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3 ; GCN-LABEL: func_non_entry_block_static_alloca_align4: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s7, s33 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GCN-NEXT: s_mov_b32 s10, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz .LBB2_3 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GCN-NEXT: s_mov_b64 s[4:5], exec +; GCN-NEXT: s_and_b64 s[6:7], vcc, -1 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB2_4 ; GCN-NEXT: ; %bb.1: ; %bb.0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GCN-NEXT: s_and_b64 exec, exec, vcc -; GCN-NEXT: s_cbranch_execz .LBB2_3 +; GCN-NEXT: s_mov_b64 s[6:7], exec +; GCN-NEXT: s_and_b64 s[8:9], vcc, -1 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB2_3 ; GCN-NEXT: ; %bb.2: ; %bb.1 -; GCN-NEXT: s_add_u32 s6, s32, 0x1000 +; GCN-NEXT: s_add_u32 s8, s32, 0x1000 ; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: v_mov_b32_e32 v3, s6 +; GCN-NEXT: v_mov_b32_e32 v3, s8 ; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; GCN-NEXT: v_mov_b32_e32 v2, 1 ; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:4 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v4 -; GCN-NEXT: v_add_u32_e32 v2, s6, v2 +; GCN-NEXT: v_add_u32_e32 v2, s8, v2 ; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen ; GCN-NEXT: v_and_b32_e32 v3, 0x3ff, v31 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_u32_e32 v2, v2, v3 ; GCN-NEXT: global_store_dword v[0:1], v2, off -; GCN-NEXT: .LBB2_3: ; %bb.2 +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: .LBB2_3: ; %Flow ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: .LBB2_4: ; %bb.2 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: global_store_dword v[0:1], v0, off ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s7 +; GCN-NEXT: s_mov_b32 s33, s10 ; GCN-NEXT: s_setpc_b64 s[30:31] entry: @@ -211,13 +217,15 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i ; GCN-LABEL: func_non_entry_block_static_alloca_align64: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s7, s33 +; GCN-NEXT: s_mov_b32 s8, s33 ; GCN-NEXT: s_add_i32 s33, s32, 0xfc0 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GCN-NEXT: s_and_b32 s33, s33, 0xfffff000 ; GCN-NEXT: s_addk_i32 s32, 0x2000 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz .LBB3_2 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GCN-NEXT: s_mov_b64 s[4:5], exec +; GCN-NEXT: s_and_b64 s[6:7], vcc, -1 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB3_2 ; GCN-NEXT: ; %bb.1: ; %bb.0 ; GCN-NEXT: s_add_u32 s6, s32, 0x1000 ; GCN-NEXT: s_and_b32 s6, s6, 0xfffff000 @@ -233,13 +241,13 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_u32_e32 v2, v2, v3 ; GCN-NEXT: global_store_dword v[0:1], v2, off -; GCN-NEXT: .LBB3_2: ; %bb.1 ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: .LBB3_2: ; %bb.1 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: global_store_dword v[0:1], v0, off ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_addk_i32 s32, 0xe000 -; GCN-NEXT: s_mov_b32 s33, s7 +; GCN-NEXT: s_mov_b32 s33, s8 ; GCN-NEXT: s_setpc_b64 s[30:31] entry: %cond = icmp eq i32 %arg.cond, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-s-buffer-load.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-s-buffer-load.mir index 0ef5aaea3b149..d0959fdb5bfad 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-s-buffer-load.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-s-buffer-load.mir @@ -86,14 +86,12 @@ body: | ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY2]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: .3: ; CHECK-NEXT: successors: %bb.4(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: .4: %0:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %1:_(s32) = COPY $sgpr0 @@ -142,14 +140,12 @@ body: | ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY1]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: .3: ; CHECK-NEXT: successors: %bb.4(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: .4: %0:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %1:_(s32) = COPY $vgpr4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll index 61263e0efa2ea..30d7a863c3956 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll @@ -147,14 +147,12 @@ define amdgpu_ps void @load_1d_vgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 %s) { ; FAST-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; FAST-NEXT: {{ $}} ; FAST-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY8]](s32), [[BUILD_VECTOR1]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8) - ; FAST-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; FAST-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; FAST-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; FAST-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; FAST-NEXT: {{ $}} ; FAST-NEXT: bb.4: ; FAST-NEXT: successors: %bb.5(0x80000000) ; FAST-NEXT: {{ $}} - ; FAST-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; FAST-NEXT: {{ $}} ; FAST-NEXT: bb.5: ; FAST-NEXT: [[COPY9:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) ; FAST-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[COPY9]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) undef`, addrspace 1) @@ -209,14 +207,12 @@ define amdgpu_ps void @load_1d_vgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 %s) { ; GREEDY-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY8]](s32), [[BUILD_VECTOR1]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8) - ; GREEDY-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GREEDY-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GREEDY-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GREEDY-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.4: ; GREEDY-NEXT: successors: %bb.5(0x80000000) ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.5: ; GREEDY-NEXT: [[COPY9:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) ; GREEDY-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[COPY9]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) undef`, addrspace 1) @@ -278,14 +274,12 @@ define amdgpu_ps void @load_1d_sgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 inreg ; FAST-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; FAST-NEXT: {{ $}} ; FAST-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s32), [[BUILD_VECTOR1]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8) - ; FAST-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; FAST-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; FAST-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; FAST-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; FAST-NEXT: {{ $}} ; FAST-NEXT: bb.4: ; FAST-NEXT: successors: %bb.5(0x80000000) ; FAST-NEXT: {{ $}} - ; FAST-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; FAST-NEXT: {{ $}} ; FAST-NEXT: bb.5: ; FAST-NEXT: [[COPY10:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) ; FAST-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[COPY10]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) undef`, addrspace 1) @@ -341,14 +335,12 @@ define amdgpu_ps void @load_1d_sgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 inreg ; GREEDY-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s32), [[BUILD_VECTOR1]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8) - ; GREEDY-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GREEDY-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GREEDY-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GREEDY-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.4: ; GREEDY-NEXT: successors: %bb.5(0x80000000) ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.5: ; GREEDY-NEXT: [[COPY10:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) ; GREEDY-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[COPY10]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) undef`, addrspace 1) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll index d6a7ae8d867fe..f792be6a0c2b8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll @@ -168,14 +168,12 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__sgpr_samp(<8 x i32> %rsr ; FAST-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; FAST-NEXT: {{ $}} ; FAST-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR2]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8) - ; FAST-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; FAST-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; FAST-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; FAST-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; FAST-NEXT: {{ $}} ; FAST-NEXT: bb.4: ; FAST-NEXT: successors: %bb.5(0x80000000) ; FAST-NEXT: {{ $}} - ; FAST-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; FAST-NEXT: {{ $}} ; FAST-NEXT: bb.5: ; FAST-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) undef`, addrspace 1) ; FAST-NEXT: S_ENDPGM 0 @@ -234,14 +232,12 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__sgpr_samp(<8 x i32> %rsr ; GREEDY-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR2]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8) - ; GREEDY-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GREEDY-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GREEDY-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GREEDY-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.4: ; GREEDY-NEXT: successors: %bb.5(0x80000000) ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.5: ; GREEDY-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) undef`, addrspace 1) ; GREEDY-NEXT: S_ENDPGM 0 @@ -298,14 +294,12 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__sgpr_rsrc__vgpr_samp(<8 x i32> inre ; FAST-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; FAST-NEXT: {{ $}} ; FAST-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR2]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8) - ; FAST-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; FAST-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; FAST-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; FAST-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; FAST-NEXT: {{ $}} ; FAST-NEXT: bb.4: ; FAST-NEXT: successors: %bb.5(0x80000000) ; FAST-NEXT: {{ $}} - ; FAST-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; FAST-NEXT: {{ $}} ; FAST-NEXT: bb.5: ; FAST-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) undef`, addrspace 1) ; FAST-NEXT: S_ENDPGM 0 @@ -356,14 +350,12 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__sgpr_rsrc__vgpr_samp(<8 x i32> inre ; GREEDY-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR2]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8) - ; GREEDY-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GREEDY-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GREEDY-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GREEDY-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.4: ; GREEDY-NEXT: successors: %bb.5(0x80000000) ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.5: ; GREEDY-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) undef`, addrspace 1) ; GREEDY-NEXT: S_ENDPGM 0 @@ -440,14 +432,12 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__vgpr_samp(<8 x i32> %rsr ; FAST-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; FAST-NEXT: {{ $}} ; FAST-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR2]](<8 x s32>), [[BUILD_VECTOR3]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8) - ; FAST-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; FAST-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; FAST-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; FAST-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; FAST-NEXT: {{ $}} ; FAST-NEXT: bb.4: ; FAST-NEXT: successors: %bb.5(0x80000000) ; FAST-NEXT: {{ $}} - ; FAST-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; FAST-NEXT: {{ $}} ; FAST-NEXT: bb.5: ; FAST-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) undef`, addrspace 1) ; FAST-NEXT: S_ENDPGM 0 @@ -518,14 +508,12 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__vgpr_samp(<8 x i32> %rsr ; GREEDY-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR2]](<8 x s32>), [[BUILD_VECTOR3]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8) - ; GREEDY-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GREEDY-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GREEDY-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GREEDY-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.4: ; GREEDY-NEXT: successors: %bb.5(0x80000000) ; GREEDY-NEXT: {{ $}} - ; GREEDY-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; GREEDY-NEXT: {{ $}} ; GREEDY-NEXT: bb.5: ; GREEDY-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) undef`, addrspace 1) ; GREEDY-NEXT: S_ENDPGM 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll index 8c7bdb867d168..b7b1eed0a32fd 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll @@ -88,14 +88,12 @@ define amdgpu_ps float @raw_buffer_load__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -135,14 +133,12 @@ define amdgpu_ps float @raw_buffer_load__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[V_READFIRSTLANE_B32_]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -194,14 +190,12 @@ define amdgpu_ps float @raw_buffer_load__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.ptr.buffer.load.ll index 19793f7020dc8..aa572be3541a1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.ptr.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.ptr.buffer.load.ll @@ -88,14 +88,12 @@ define amdgpu_ps float @raw_ptr_buffer_load__vgpr_rsrc__vgpr_val__vgpr_voffset__ ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -135,14 +133,12 @@ define amdgpu_ps float @raw_ptr_buffer_load__sgpr_rsrc__vgpr_val__vgpr_voffset__ ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[V_READFIRSTLANE_B32_]], 0, 0, 0 :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -194,14 +190,12 @@ define amdgpu_ps float @raw_ptr_buffer_load__vgpr_rsrc__vgpr_val__vgpr_voffset__ ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0 :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll index ccf3647060e42..d431e7affeb6f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll @@ -1640,11 +1640,10 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg % ; GFX7-NEXT: successors: %bb.4, %bb.2 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY5]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s32)) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.4: - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.5: ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) @@ -1686,11 +1685,10 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg % ; GFX12-NEXT: successors: %bb.4, %bb.2 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY5]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s32)) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) @@ -1738,11 +1736,10 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> % ; GFX7-NEXT: successors: %bb.4, %bb.2 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4092, 0, 0 :: (dereferenceable invariant load (s32)) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.4: - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.5: ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) @@ -1785,11 +1782,10 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> % ; GFX12-NEXT: successors: %bb.4, %bb.2 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4092, 0, 0 :: (dereferenceable invariant load (s32)) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) @@ -1839,11 +1835,10 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> % ; GFX7-NEXT: successors: %bb.4, %bb.2 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s32)) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.4: - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.5: ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) @@ -1886,11 +1881,10 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> % ; GFX12-NEXT: successors: %bb.4, %bb.2 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4096, 0, 0 :: (dereferenceable invariant load (s32)) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) @@ -1938,11 +1932,10 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc) ; GFX7-NEXT: successors: %bb.4, %bb.2 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4095, 0, 0 :: (dereferenceable invariant load (s32) from unknown-address + 4095, align 1) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.4: - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.5: ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) @@ -1984,11 +1977,10 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc) ; GFX12-NEXT: successors: %bb.4, %bb.2 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4095, 0, 0 :: (dereferenceable invariant load (s32) from unknown-address + 4095, align 1) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) @@ -2035,11 +2027,10 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc) ; GFX7-NEXT: successors: %bb.4, %bb.2 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s32)) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.4: - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.5: ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) @@ -2081,11 +2072,10 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc) ; GFX12-NEXT: successors: %bb.4, %bb.2 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4096, 0, 0 :: (dereferenceable invariant load (s32) from unknown-address + 4096) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) @@ -2135,11 +2125,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> % ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4064, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4080, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.4: - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.5: ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) @@ -2192,11 +2181,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> % ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4064, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4080, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) @@ -2257,11 +2245,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> % ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.4: - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.5: ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) @@ -2314,11 +2301,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> % ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4068, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4084, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) @@ -2377,11 +2363,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> % ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.4: - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.5: ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) @@ -2434,11 +2419,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> % ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4096, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4112, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) @@ -2496,11 +2480,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.4: - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.5: ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) @@ -2554,11 +2537,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 5000, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 5016, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) @@ -2616,11 +2598,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.4: - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.5: ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) @@ -2674,11 +2655,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4076, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4092, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) @@ -2736,11 +2716,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.4: - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.5: ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) @@ -2794,11 +2773,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4080, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4096, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) @@ -2855,11 +2833,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4064, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4) ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4080, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4) - ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX7-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX7-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.4: - ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: bb.5: ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) @@ -2911,11 +2888,10 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4064, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4) ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4080, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.4: - ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.5: ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll index f664e62761ad5..287b3847abe44 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll @@ -86,14 +86,12 @@ define amdgpu_ps float @struct_buffer_load__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]], [[COPY6]], 0, 0, -1 :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -132,14 +130,12 @@ define amdgpu_ps float @struct_buffer_load__sgpr_rsrc__vgpr_val__vgpr_vindex_vgp ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]], [[V_READFIRSTLANE_B32_]], 0, 0, -1 :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -190,14 +186,12 @@ define amdgpu_ps float @struct_buffer_load__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]], [[V_READFIRSTLANE_B32_4]], 0, 0, -1 :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll index 0f72586ed6c12..c74bc81551000 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll @@ -88,14 +88,12 @@ define amdgpu_ps void @struct_buffer_store__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: G_AMDGPU_BUFFER_STORE [[COPY4]](s32), [[BUILD_VECTOR1]](<4 x s32>), [[COPY5]](s32), [[COPY6]], [[COPY7]], 0, 0, -1 :: (dereferenceable store (s32), align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.struct.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -134,14 +132,12 @@ define amdgpu_ps void @struct_buffer_store__sgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: G_AMDGPU_BUFFER_STORE [[COPY4]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY5]](s32), [[COPY6]], [[V_READFIRSTLANE_B32_]], 0, 0, -1 :: (dereferenceable store (s32), align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.struct.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -192,14 +188,12 @@ define amdgpu_ps void @struct_buffer_store__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: G_AMDGPU_BUFFER_STORE [[COPY4]](s32), [[BUILD_VECTOR1]](<4 x s32>), [[COPY5]](s32), [[COPY6]], [[V_READFIRSTLANE_B32_4]], 0, 0, -1 :: (dereferenceable store (s32), align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.struct.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.ptr.buffer.load.ll index b835b3a3e380b..673c46c25d9fd 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.ptr.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.ptr.buffer.load.ll @@ -86,14 +86,12 @@ define amdgpu_ps float @struct_ptr_buffer_load__vgpr_rsrc__vgpr_val__vgpr_vindex ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]], [[COPY6]], 0, 0, -1 :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -132,14 +130,12 @@ define amdgpu_ps float @struct_ptr_buffer_load__sgpr_rsrc__vgpr_val__vgpr_vindex ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]], [[V_READFIRSTLANE_B32_]], 0, 0, -1 :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -190,14 +186,12 @@ define amdgpu_ps float @struct_ptr_buffer_load__vgpr_rsrc__vgpr_val__vgpr_vindex ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]], [[V_READFIRSTLANE_B32_4]], 0, 0, -1 :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.ptr.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.ptr.buffer.store.ll index 0cefc373dd7cd..35c1327dcd7c1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.ptr.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.ptr.buffer.store.ll @@ -88,14 +88,12 @@ define amdgpu_ps void @struct_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_vindex ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: G_AMDGPU_BUFFER_STORE [[COPY4]](s32), [[BUILD_VECTOR1]](<4 x s32>), [[COPY5]](s32), [[COPY6]], [[COPY7]], 0, 0, -1 :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -134,14 +132,12 @@ define amdgpu_ps void @struct_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_vindex ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: G_AMDGPU_BUFFER_STORE [[COPY4]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY5]](s32), [[COPY6]], [[V_READFIRSTLANE_B32_]], 0, 0, -1 :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -192,14 +188,12 @@ define amdgpu_ps void @struct_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_vindex ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: G_AMDGPU_BUFFER_STORE [[COPY4]](s32), [[BUILD_VECTOR1]](<4 x s32>), [[COPY5]](s32), [[COPY6]], [[V_READFIRSTLANE_B32_4]], 0, 0, -1 :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-waterfall-agpr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-waterfall-agpr.mir index 23383f27efce7..336c3df946220 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-waterfall-agpr.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-waterfall-agpr.mir @@ -40,14 +40,12 @@ body: | ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: G_AMDGPU_BUFFER_STORE %val(s32), %rsrc(<4 x s32>), [[COPY]](s32), %voffset, [[V_READFIRSTLANE_B32_]], 0, 0, 0 :: (dereferenceable store (s32), addrspace 4) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: .3: ; CHECK-NEXT: successors: %bb.4(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: .4: ; CHECK-NEXT: S_ENDPGM 0 %val:_(s32) = COPY $vgpr0 @@ -108,14 +106,12 @@ body: | ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY1]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (<4 x s32>)) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; CHECK-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: .3: ; CHECK-NEXT: successors: %bb.4(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: .4: ; CHECK-NEXT: S_ENDPGM 0, implicit [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) %0:_(<8 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll index 377fa24cb4755..5d5bbdaa765f1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -14,16 +14,11 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CHECK-NEXT: s_cbranch_execnz .LBB0_3 -; CHECK-NEXT: ; %bb.1: ; %Flow -; CHECK-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; CHECK-NEXT: s_cbranch_execnz .LBB0_4 -; CHECK-NEXT: .LBB0_2: -; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] -; CHECK-NEXT: s_setpc_b64 s[30:31] -; CHECK-NEXT: .LBB0_3: +; CHECK-NEXT: s_xor_b64 s[6:7], vcc, exec +; CHECK-NEXT: s_and_b64 s[4:5], vcc, -1 +; CHECK-NEXT: s_cmov_b64 exec, vcc +; CHECK-NEXT: s_cbranch_scc0 .LBB0_2 +; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v0 ; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v3, v0, vcc @@ -159,9 +154,13 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; CHECK-NEXT: ; implicit-def: $vgpr2 ; CHECK-NEXT: ; implicit-def: $vgpr4 -; CHECK-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; CHECK-NEXT: s_cbranch_execz .LBB0_2 -; CHECK-NEXT: .LBB0_4: +; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] +; CHECK-NEXT: .LBB0_2: ; %Flow +; CHECK-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; CHECK-NEXT: s_and_b64 s[4:5], s[6:7], -1 +; CHECK-NEXT: s_cmov_b64 exec, s[6:7] +; CHECK-NEXT: s_cbranch_scc0 .LBB0_4 +; CHECK-NEXT: ; %bb.3: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v2 ; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -182,7 +181,8 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] +; CHECK-NEXT: s_or_b64 exec, exec, s[8:9] +; CHECK-NEXT: .LBB0_4: ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = sdiv i64 %num, %den ret i64 %result @@ -654,11 +654,12 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_mov_b32_e32 v0, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; CGP-NEXT: v_mov_b32_e32 v8, v2 +; CGP-NEXT: s_xor_b64 s[6:7], vcc, exec ; CGP-NEXT: v_mov_b32_e32 v9, v3 +; CGP-NEXT: s_and_b64 s[4:5], vcc, -1 ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CGP-NEXT: s_cbranch_execz .LBB2_2 +; CGP-NEXT: s_cmov_b64 exec, vcc +; CGP-NEXT: s_cbranch_scc0 .LBB2_2 ; CGP-NEXT: ; %bb.1: ; CGP-NEXT: v_ashrrev_i32_e32 v0, 31, v5 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v0 @@ -793,9 +794,12 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; CGP-NEXT: ; implicit-def: $vgpr4 ; CGP-NEXT: ; implicit-def: $vgpr10 +; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: .LBB2_2: ; %Flow1 -; CGP-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; CGP-NEXT: s_cbranch_execz .LBB2_4 +; CGP-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; CGP-NEXT: s_and_b64 s[4:5], s[6:7], -1 +; CGP-NEXT: s_cmov_b64 exec, s[6:7] +; CGP-NEXT: s_cbranch_scc0 .LBB2_4 ; CGP-NEXT: ; %bb.3: ; CGP-NEXT: v_cvt_f32_u32_e32 v0, v4 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v4 @@ -817,22 +821,17 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; CGP-NEXT: v_mov_b32_e32 v1, 0 +; CGP-NEXT: s_or_b64 exec, exec, s[8:9] ; CGP-NEXT: .LBB2_4: -; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: v_or_b32_e32 v3, v9, v7 ; CGP-NEXT: v_mov_b32_e32 v2, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CGP-NEXT: s_cbranch_execnz .LBB2_7 -; CGP-NEXT: ; %bb.5: ; %Flow -; CGP-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; CGP-NEXT: s_cbranch_execnz .LBB2_8 -; CGP-NEXT: .LBB2_6: -; CGP-NEXT: s_or_b64 exec, exec, s[6:7] -; CGP-NEXT: s_setpc_b64 s[30:31] -; CGP-NEXT: .LBB2_7: +; CGP-NEXT: s_xor_b64 s[6:7], vcc, exec +; CGP-NEXT: s_and_b64 s[4:5], vcc, -1 +; CGP-NEXT: s_cmov_b64 exec, vcc +; CGP-NEXT: s_cbranch_scc0 .LBB2_6 +; CGP-NEXT: ; %bb.5: ; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v7 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v6, v2 ; CGP-NEXT: v_addc_u32_e32 v5, vcc, v7, v2, vcc @@ -966,9 +965,13 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc ; CGP-NEXT: ; implicit-def: $vgpr6 ; CGP-NEXT: ; implicit-def: $vgpr8 -; CGP-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; CGP-NEXT: s_cbranch_execz .LBB2_6 -; CGP-NEXT: .LBB2_8: +; CGP-NEXT: s_or_b64 exec, exec, s[6:7] +; CGP-NEXT: .LBB2_6: ; %Flow +; CGP-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; CGP-NEXT: s_and_b64 s[4:5], s[6:7], -1 +; CGP-NEXT: s_cmov_b64 exec, s[6:7] +; CGP-NEXT: s_cbranch_scc0 .LBB2_8 +; CGP-NEXT: ; %bb.7: ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v6 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v6 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 @@ -989,7 +992,8 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: v_mov_b32_e32 v3, 0 -; CGP-NEXT: s_or_b64 exec, exec, s[6:7] +; CGP-NEXT: s_or_b64 exec, exec, s[8:9] +; CGP-NEXT: .LBB2_8: ; CGP-NEXT: s_setpc_b64 s[30:31] %result = sdiv <2 x i64> %num, %den ret <2 x i64> %result @@ -1661,16 +1665,11 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_or_b32_e32 v1, v4, v6 ; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CHECK-NEXT: s_cbranch_execnz .LBB7_3 -; CHECK-NEXT: ; %bb.1: ; %Flow -; CHECK-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; CHECK-NEXT: s_cbranch_execnz .LBB7_4 -; CHECK-NEXT: .LBB7_2: -; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] -; CHECK-NEXT: s_setpc_b64 s[30:31] -; CHECK-NEXT: .LBB7_3: +; CHECK-NEXT: s_xor_b64 s[6:7], vcc, exec +; CHECK-NEXT: s_and_b64 s[4:5], vcc, -1 +; CHECK-NEXT: s_cmov_b64 exec, vcc +; CHECK-NEXT: s_cbranch_scc0 .LBB7_2 +; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_ashrrev_i32_e32 v0, 31, v6 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v5, v0 ; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v6, v0, vcc @@ -1804,9 +1803,13 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6 ; CHECK-NEXT: ; implicit-def: $vgpr3 -; CHECK-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; CHECK-NEXT: s_cbranch_execz .LBB7_2 -; CHECK-NEXT: .LBB7_4: +; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] +; CHECK-NEXT: .LBB7_2: ; %Flow +; CHECK-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; CHECK-NEXT: s_and_b64 s[4:5], s[6:7], -1 +; CHECK-NEXT: s_cmov_b64 exec, s[6:7] +; CHECK-NEXT: s_cbranch_scc0 .LBB7_4 +; CHECK-NEXT: ; %bb.3: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v5 ; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v5 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -1827,7 +1830,8 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] +; CHECK-NEXT: s_or_b64 exec, exec, s[8:9] +; CHECK-NEXT: .LBB7_4: ; CHECK-NEXT: s_setpc_b64 s[30:31] %shl.y = shl i64 4096, %y %r = sdiv i64 %x, %shl.y @@ -2113,23 +2117,24 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_mov_b32_e32 v5, v2 -; CGP-NEXT: v_mov_b32_e32 v7, v3 +; CGP-NEXT: v_mov_b32_e32 v9, v3 ; CGP-NEXT: v_mov_b32_e32 v2, 0x1000 ; CGP-NEXT: v_mov_b32_e32 v3, 0 -; CGP-NEXT: v_lshl_b64 v[11:12], v[2:3], v4 -; CGP-NEXT: v_mov_b32_e32 v9, v1 -; CGP-NEXT: v_mov_b32_e32 v8, v0 -; CGP-NEXT: v_or_b32_e32 v1, v9, v12 +; CGP-NEXT: v_lshl_b64 v[12:13], v[2:3], v4 +; CGP-NEXT: v_mov_b32_e32 v8, v1 +; CGP-NEXT: v_mov_b32_e32 v7, v0 +; CGP-NEXT: v_or_b32_e32 v1, v8, v13 ; CGP-NEXT: v_mov_b32_e32 v0, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CGP-NEXT: s_cbranch_execz .LBB8_2 +; CGP-NEXT: s_xor_b64 s[6:7], vcc, exec +; CGP-NEXT: s_and_b64 s[4:5], vcc, -1 +; CGP-NEXT: s_cmov_b64 exec, vcc +; CGP-NEXT: s_cbranch_scc0 .LBB8_2 ; CGP-NEXT: ; %bb.1: -; CGP-NEXT: v_ashrrev_i32_e32 v0, 31, v12 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v11, v0 -; CGP-NEXT: v_addc_u32_e32 v10, vcc, v12, v0, vcc +; CGP-NEXT: v_ashrrev_i32_e32 v0, 31, v13 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v12, v0 +; CGP-NEXT: v_addc_u32_e32 v10, vcc, v13, v0, vcc ; CGP-NEXT: v_xor_b32_e32 v4, v1, v0 ; CGP-NEXT: v_xor_b32_e32 v1, v10, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v10, v4 @@ -2172,276 +2177,275 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v11, vcc ; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 ; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[11:12] -; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v14 +; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v8 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14 ; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] -; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v14, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v8, v14 -; CGP-NEXT: v_mul_lo_u32 v8, v16, v10 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v14, vcc +; CGP-NEXT: v_xor_b32_e32 v12, v7, v14 +; CGP-NEXT: v_mul_lo_u32 v7, v16, v10 ; CGP-NEXT: v_mul_lo_u32 v15, v13, v11 -; CGP-NEXT: v_xor_b32_e32 v17, v9, v14 -; CGP-NEXT: v_mul_hi_u32 v9, v13, v10 +; CGP-NEXT: v_xor_b32_e32 v17, v8, v14 +; CGP-NEXT: v_mul_hi_u32 v8, v13, v10 ; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v15 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v16, v11 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v15, v8 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v8, v16, v11 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v15, v7 ; CGP-NEXT: v_mul_hi_u32 v15, v13, v11 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v15 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v15 ; CGP-NEXT: v_mul_hi_u32 v11, v16, v11 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, v16, v9, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v17, v8 -; CGP-NEXT: v_mul_lo_u32 v11, v12, v9 -; CGP-NEXT: v_mul_hi_u32 v13, v12, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v17, v8 -; CGP-NEXT: v_mul_hi_u32 v15, v17, v9 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v13, v7 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v16, v8, vcc +; CGP-NEXT: v_mul_lo_u32 v10, v17, v7 +; CGP-NEXT: v_mul_lo_u32 v11, v12, v8 +; CGP-NEXT: v_mul_hi_u32 v13, v12, v7 +; CGP-NEXT: v_mul_hi_u32 v7, v17, v7 +; CGP-NEXT: v_mul_hi_u32 v15, v17, v8 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v17, v9 +; CGP-NEXT: v_mul_lo_u32 v13, v17, v8 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_mul_hi_u32 v11, v12, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8 +; CGP-NEXT: v_mul_hi_u32 v11, v12, v8 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v13, v7 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v8, v10 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v13, 0 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v7, v10 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v13, 0 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v15, v10 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v11, v[9:10] -; CGP-NEXT: v_sub_i32_e32 v8, vcc, v12, v8 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v1, v13, v[9:10] -; CGP-NEXT: v_subb_u32_e64 v10, s[4:5], v17, v9, vcc -; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v17, v9 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v1 -; CGP-NEXT: v_subb_u32_e32 v9, vcc, v9, v1, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v10 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v4, v15, v[8:9] +; CGP-NEXT: v_sub_i32_e32 v7, vcc, v12, v7 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v1, v13, v[10:11] +; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v17, v10, vcc +; CGP-NEXT: v_sub_i32_e64 v10, s[4:5], v17, v10 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v1 +; CGP-NEXT: v_subb_u32_e32 v10, vcc, v10, v1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v4 +; CGP-NEXT: v_sub_i32_e32 v7, vcc, v7, v4 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v4 -; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v4 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v1 -; CGP-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v9, vcc -; CGP-NEXT: v_cndmask_b32_e64 v10, v12, v15, s[4:5] -; CGP-NEXT: v_add_i32_e32 v12, vcc, 1, v13 -; CGP-NEXT: v_addc_u32_e32 v15, vcc, 0, v11, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v9, v1 +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v1 +; CGP-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v10, vcc +; CGP-NEXT: v_cndmask_b32_e64 v8, v11, v12, s[4:5] +; CGP-NEXT: v_add_i32_e32 v11, vcc, 1, v13 +; CGP-NEXT: v_addc_u32_e32 v12, vcc, 0, v15, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v10, v1 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, -1, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v8, v4 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v9, v1 +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v10, v1 ; CGP-NEXT: v_cndmask_b32_e32 v1, v16, v4, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v12 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v15, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v11 +; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v12, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CGP-NEXT: v_cndmask_b32_e32 v1, v12, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v4, v15, v8, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; CGP-NEXT: v_cndmask_b32_e32 v1, v11, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v12, v7, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; CGP-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v8, v14, v0 -; CGP-NEXT: v_cndmask_b32_e32 v4, v11, v4, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v1, v8 -; CGP-NEXT: v_xor_b32_e32 v1, v4, v8 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc -; CGP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; CGP-NEXT: ; implicit-def: $vgpr8 +; CGP-NEXT: v_xor_b32_e32 v7, v14, v0 +; CGP-NEXT: v_cndmask_b32_e32 v4, v15, v4, vcc +; CGP-NEXT: v_xor_b32_e32 v0, v1, v7 +; CGP-NEXT: v_xor_b32_e32 v1, v4, v7 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v7, vcc +; CGP-NEXT: ; implicit-def: $vgpr12_vgpr13 +; CGP-NEXT: ; implicit-def: $vgpr7 +; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: .LBB8_2: ; %Flow1 -; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7] -; CGP-NEXT: v_lshl_b64 v[9:10], v[2:3], v6 -; CGP-NEXT: s_xor_b64 exec, exec, s[6:7] -; CGP-NEXT: s_cbranch_execz .LBB8_4 +; CGP-NEXT: v_lshl_b64 v[10:11], v[2:3], v6 +; CGP-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; CGP-NEXT: s_and_b64 s[4:5], s[6:7], -1 +; CGP-NEXT: s_cmov_b64 exec, s[6:7] +; CGP-NEXT: s_cbranch_scc0 .LBB8_4 ; CGP-NEXT: ; %bb.3: -; CGP-NEXT: v_cvt_f32_u32_e32 v0, v11 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v11 +; CGP-NEXT: v_cvt_f32_u32_e32 v0, v12 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v12 ; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0 ; CGP-NEXT: v_mul_lo_u32 v1, v1, v0 ; CGP-NEXT: v_mul_hi_u32 v1, v0, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 -; CGP-NEXT: v_mul_lo_u32 v1, v0, v11 +; CGP-NEXT: v_mul_hi_u32 v0, v7, v0 +; CGP-NEXT: v_mul_lo_u32 v1, v0, v12 ; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v8, v1 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v11 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v7, v1 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v12 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CGP-NEXT: v_sub_i32_e64 v2, s[4:5], v1, v11 +; CGP-NEXT: v_sub_i32_e64 v2, s[4:5], v1, v12 ; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v11 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v12 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; CGP-NEXT: v_mov_b32_e32 v1, 0 +; CGP-NEXT: s_or_b64 exec, exec, s[8:9] ; CGP-NEXT: .LBB8_4: -; CGP-NEXT: s_or_b64 exec, exec, s[6:7] -; CGP-NEXT: v_or_b32_e32 v3, v7, v10 +; CGP-NEXT: v_or_b32_e32 v3, v9, v11 ; CGP-NEXT: v_mov_b32_e32 v2, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CGP-NEXT: s_cbranch_execnz .LBB8_7 -; CGP-NEXT: ; %bb.5: ; %Flow -; CGP-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; CGP-NEXT: s_cbranch_execnz .LBB8_8 -; CGP-NEXT: .LBB8_6: -; CGP-NEXT: s_or_b64 exec, exec, s[6:7] -; CGP-NEXT: s_setpc_b64 s[30:31] -; CGP-NEXT: .LBB8_7: -; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v10 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v2 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v10, v2, vcc +; CGP-NEXT: s_xor_b64 s[6:7], vcc, exec +; CGP-NEXT: s_and_b64 s[4:5], vcc, -1 +; CGP-NEXT: s_cmov_b64 exec, vcc +; CGP-NEXT: s_cbranch_scc0 .LBB8_6 +; CGP-NEXT: ; %bb.5: +; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v11 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v2 +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v11, v2, vcc ; CGP-NEXT: v_xor_b32_e32 v4, v3, v2 ; CGP-NEXT: v_xor_b32_e32 v3, v6, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v4 -; CGP-NEXT: v_cvt_f32_u32_e32 v8, v3 -; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v4 -; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v3, vcc -; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v8 +; CGP-NEXT: v_cvt_f32_u32_e32 v7, v3 +; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v4 +; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v3, vcc +; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 ; CGP-NEXT: v_rcp_iflag_f32_e32 v6, v6 ; CGP-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 -; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v6 -; CGP-NEXT: v_trunc_f32_e32 v10, v8 -; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v10 -; CGP-NEXT: v_cvt_u32_f32_e32 v11, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v14, v10 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 -; CGP-NEXT: v_mov_b32_e32 v6, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v14, v[6:7] -; CGP-NEXT: v_mul_lo_u32 v6, v14, v8 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v10, v11, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v14, v8 -; CGP-NEXT: v_mul_lo_u32 v15, v11, v9 -; CGP-NEXT: v_mul_lo_u32 v16, v14, v9 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v15 +; CGP-NEXT: v_mul_f32_e32 v7, 0x2f800000, v6 +; CGP-NEXT: v_trunc_f32_e32 v8, v7 +; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v8 +; CGP-NEXT: v_cvt_u32_f32_e32 v10, v6 +; CGP-NEXT: v_cvt_u32_f32_e32 v13, v8 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v10, 0 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[7:8] +; CGP-NEXT: v_mul_hi_u32 v14, v10, v6 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v10, v[7:8] +; CGP-NEXT: v_mul_lo_u32 v8, v13, v6 +; CGP-NEXT: v_mul_hi_u32 v6, v13, v6 +; CGP-NEXT: v_mul_lo_u32 v15, v10, v7 +; CGP-NEXT: v_mul_lo_u32 v16, v13, v7 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v11, v9 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v15, v6 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v16, v8 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v14 +; CGP-NEXT: v_mul_hi_u32 v14, v10, v7 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v15, v8 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v16, v6 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v15, v10 -; CGP-NEXT: v_mul_hi_u32 v9, v14, v9 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v14 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; CGP-NEXT: v_mul_hi_u32 v7, v13, v7 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v6 -; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v8, vcc -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 -; CGP-NEXT: v_mov_b32_e32 v6, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v14, v[6:7] -; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v7 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v14, v8 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v6 +; CGP-NEXT: v_addc_u32_e32 v13, vcc, v13, v7, vcc +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v10, 0 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[7:8] +; CGP-NEXT: v_ashrrev_i32_e32 v11, 31, v9 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v10, v[7:8] +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v11, vcc +; CGP-NEXT: v_xor_b32_e32 v9, v5, v11 +; CGP-NEXT: v_mul_lo_u32 v5, v13, v6 +; CGP-NEXT: v_mul_lo_u32 v12, v10, v7 +; CGP-NEXT: v_mul_hi_u32 v14, v10, v6 +; CGP-NEXT: v_mul_hi_u32 v6, v13, v6 +; CGP-NEXT: v_xor_b32_e32 v8, v8, v11 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v7, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v10, v5, v12 -; CGP-NEXT: v_mul_lo_u32 v5, v14, v8 -; CGP-NEXT: v_mul_lo_u32 v7, v11, v9 -; CGP-NEXT: v_xor_b32_e32 v13, v6, v12 -; CGP-NEXT: v_mul_hi_u32 v6, v11, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v14, v8 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v14, v9 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_mul_hi_u32 v7, v11, v9 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_mul_hi_u32 v8, v14, v9 +; CGP-NEXT: v_mul_lo_u32 v14, v13, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; CGP-NEXT: v_mul_hi_u32 v12, v10, v7 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v14, v6 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 +; CGP-NEXT: v_mul_hi_u32 v7, v13, v7 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v12, v6 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v11, v5 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v14, v6, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v13, v5 -; CGP-NEXT: v_mul_lo_u32 v8, v10, v6 -; CGP-NEXT: v_mul_hi_u32 v9, v10, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v13, v5 -; CGP-NEXT: v_mul_hi_u32 v11, v13, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v13, v6, vcc +; CGP-NEXT: v_mul_lo_u32 v7, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v10, v9, v6 +; CGP-NEXT: v_mul_hi_u32 v12, v9, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v13, v8, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v13, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_mul_hi_u32 v8, v10, v6 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v5, v7 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, 0 +; CGP-NEXT: v_mul_lo_u32 v12, v8, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; CGP-NEXT: v_mul_hi_u32 v10, v9, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v5, v7 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v12, 0 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v7 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v8, v[6:7] -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v9, v[6:7] -; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v13, v6, vcc -; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v13, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v7 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v10, v[6:7] +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v9, v5 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v12, v[6:7] +; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v8, v6, vcc +; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v8, v6 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3 ; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v4 ; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v4 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v3 ; CGP-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc -; CGP-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[4:5] -; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v9 -; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc +; CGP-NEXT: v_cndmask_b32_e64 v7, v8, v9, s[4:5] +; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v12 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 ; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v4, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v10 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v11, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v8 +; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v4, v11, v5, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v5, v12, v2 -; CGP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc +; CGP-NEXT: v_xor_b32_e32 v5, v11, v2 +; CGP-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc ; CGP-NEXT: v_xor_b32_e32 v2, v3, v5 ; CGP-NEXT: v_xor_b32_e32 v3, v4, v5 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 ; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc -; CGP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; CGP-NEXT: ; implicit-def: $vgpr10_vgpr11 ; CGP-NEXT: ; implicit-def: $vgpr5 -; CGP-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; CGP-NEXT: s_cbranch_execz .LBB8_6 -; CGP-NEXT: .LBB8_8: -; CGP-NEXT: v_cvt_f32_u32_e32 v2, v9 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v9 +; CGP-NEXT: s_or_b64 exec, exec, s[6:7] +; CGP-NEXT: .LBB8_6: ; %Flow +; CGP-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; CGP-NEXT: s_and_b64 s[4:5], s[6:7], -1 +; CGP-NEXT: s_cmov_b64 exec, s[6:7] +; CGP-NEXT: s_cbranch_scc0 .LBB8_8 +; CGP-NEXT: ; %bb.7: +; CGP-NEXT: v_cvt_f32_u32_e32 v2, v10 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v10 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 @@ -2449,18 +2453,19 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_mul_hi_u32 v2, v5, v2 -; CGP-NEXT: v_mul_lo_u32 v3, v2, v9 +; CGP-NEXT: v_mul_lo_u32 v3, v2, v10 ; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, v5, v3 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v9 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v10 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v9 +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v10 ; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v9 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v10 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: v_mov_b32_e32 v3, 0 -; CGP-NEXT: s_or_b64 exec, exec, s[6:7] +; CGP-NEXT: s_or_b64 exec, exec, s[8:9] +; CGP-NEXT: .LBB8_8: ; CGP-NEXT: s_setpc_b64 s[30:31] %shl.y = shl <2 x i64> , %y %r = sdiv <2 x i64> %x, %shl.y diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll index 83ebc84e1f84a..b92b2c040ae67 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -14,16 +14,11 @@ define i64 @v_srem_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CHECK-NEXT: s_cbranch_execnz .LBB0_3 -; CHECK-NEXT: ; %bb.1: ; %Flow -; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] -; CHECK-NEXT: s_cbranch_execnz .LBB0_4 -; CHECK-NEXT: .LBB0_2: -; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_setpc_b64 s[30:31] -; CHECK-NEXT: .LBB0_3: +; CHECK-NEXT: s_xor_b64 s[6:7], vcc, exec +; CHECK-NEXT: s_and_b64 s[4:5], vcc, -1 +; CHECK-NEXT: s_cmov_b64 exec, vcc +; CHECK-NEXT: s_cbranch_scc0 .LBB0_2 +; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v3 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v2, v1 ; CHECK-NEXT: v_addc_u32_e32 v2, vcc, v3, v1, vcc @@ -155,9 +150,13 @@ define i64 @v_srem_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc ; CHECK-NEXT: ; implicit-def: $vgpr2 ; CHECK-NEXT: ; implicit-def: $vgpr4 -; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] -; CHECK-NEXT: s_cbranch_execz .LBB0_2 -; CHECK-NEXT: .LBB0_4: +; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] +; CHECK-NEXT: .LBB0_2: ; %Flow +; CHECK-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; CHECK-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; CHECK-NEXT: s_cmov_b64 exec, s[6:7] +; CHECK-NEXT: s_cbranch_scc0 .LBB0_4 +; CHECK-NEXT: ; %bb.3: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v2 ; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -177,6 +176,7 @@ define i64 @v_srem_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: .LBB0_4: ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = srem i64 %num, %den ret i64 %result @@ -640,11 +640,12 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_mov_b32_e32 v0, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; CGP-NEXT: v_mov_b32_e32 v8, v2 +; CGP-NEXT: s_xor_b64 s[6:7], vcc, exec ; CGP-NEXT: v_mov_b32_e32 v9, v3 +; CGP-NEXT: s_and_b64 s[4:5], vcc, -1 ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CGP-NEXT: s_cbranch_execz .LBB2_2 +; CGP-NEXT: s_cmov_b64 exec, vcc +; CGP-NEXT: s_cbranch_scc0 .LBB2_2 ; CGP-NEXT: ; %bb.1: ; CGP-NEXT: v_ashrrev_i32_e32 v1, 31, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v1 @@ -777,9 +778,12 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc ; CGP-NEXT: ; implicit-def: $vgpr4 ; CGP-NEXT: ; implicit-def: $vgpr10 +; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: .LBB2_2: ; %Flow1 -; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] -; CGP-NEXT: s_cbranch_execz .LBB2_4 +; CGP-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; CGP-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; CGP-NEXT: s_cmov_b64 exec, s[6:7] +; CGP-NEXT: s_cbranch_scc0 .LBB2_4 ; CGP-NEXT: ; %bb.3: ; CGP-NEXT: v_cvt_f32_u32_e32 v0, v4 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v4 @@ -799,22 +803,17 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; CGP-NEXT: v_mov_b32_e32 v1, 0 -; CGP-NEXT: .LBB2_4: ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] +; CGP-NEXT: .LBB2_4: ; CGP-NEXT: v_or_b32_e32 v3, v9, v7 ; CGP-NEXT: v_mov_b32_e32 v2, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CGP-NEXT: s_cbranch_execnz .LBB2_7 -; CGP-NEXT: ; %bb.5: ; %Flow -; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] -; CGP-NEXT: s_cbranch_execnz .LBB2_8 -; CGP-NEXT: .LBB2_6: -; CGP-NEXT: s_or_b64 exec, exec, s[4:5] -; CGP-NEXT: s_setpc_b64 s[30:31] -; CGP-NEXT: .LBB2_7: +; CGP-NEXT: s_xor_b64 s[6:7], vcc, exec +; CGP-NEXT: s_and_b64 s[4:5], vcc, -1 +; CGP-NEXT: s_cmov_b64 exec, vcc +; CGP-NEXT: s_cbranch_scc0 .LBB2_6 +; CGP-NEXT: ; %bb.5: ; CGP-NEXT: v_ashrrev_i32_e32 v3, 31, v7 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v3 ; CGP-NEXT: v_addc_u32_e32 v4, vcc, v7, v3, vcc @@ -946,9 +945,13 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc ; CGP-NEXT: ; implicit-def: $vgpr6 ; CGP-NEXT: ; implicit-def: $vgpr8 -; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] -; CGP-NEXT: s_cbranch_execz .LBB2_6 -; CGP-NEXT: .LBB2_8: +; CGP-NEXT: s_or_b64 exec, exec, s[6:7] +; CGP-NEXT: .LBB2_6: ; %Flow +; CGP-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; CGP-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; CGP-NEXT: s_cmov_b64 exec, s[6:7] +; CGP-NEXT: s_cbranch_scc0 .LBB2_8 +; CGP-NEXT: ; %bb.7: ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v6 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v6 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 @@ -968,6 +971,7 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; CGP-NEXT: v_mov_b32_e32 v3, 0 ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] +; CGP-NEXT: .LBB2_8: ; CGP-NEXT: s_setpc_b64 s[30:31] %result = srem <2 x i64> %num, %den ret <2 x i64> %result @@ -2176,16 +2180,11 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_or_b32_e32 v1, v4, v6 ; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CHECK-NEXT: s_cbranch_execnz .LBB7_3 -; CHECK-NEXT: ; %bb.1: ; %Flow -; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] -; CHECK-NEXT: s_cbranch_execnz .LBB7_4 -; CHECK-NEXT: .LBB7_2: -; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_setpc_b64 s[30:31] -; CHECK-NEXT: .LBB7_3: +; CHECK-NEXT: s_xor_b64 s[6:7], vcc, exec +; CHECK-NEXT: s_and_b64 s[4:5], vcc, -1 +; CHECK-NEXT: s_cmov_b64 exec, vcc +; CHECK-NEXT: s_cbranch_scc0 .LBB7_2 +; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v6 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v1 ; CHECK-NEXT: v_addc_u32_e32 v2, vcc, v6, v1, vcc @@ -2319,9 +2318,13 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc ; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6 ; CHECK-NEXT: ; implicit-def: $vgpr3 -; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] -; CHECK-NEXT: s_cbranch_execz .LBB7_2 -; CHECK-NEXT: .LBB7_4: +; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] +; CHECK-NEXT: .LBB7_2: ; %Flow +; CHECK-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; CHECK-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; CHECK-NEXT: s_cmov_b64 exec, s[6:7] +; CHECK-NEXT: s_cbranch_scc0 .LBB7_4 +; CHECK-NEXT: ; %bb.3: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v5 ; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v5 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -2341,6 +2344,7 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: .LBB7_4: ; CHECK-NEXT: s_setpc_b64 s[30:31] %shl.y = shl i64 4096, %y %r = srem i64 %x, %shl.y @@ -2622,23 +2626,24 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_mov_b32_e32 v5, v2 -; CGP-NEXT: v_mov_b32_e32 v7, v3 +; CGP-NEXT: v_mov_b32_e32 v9, v3 ; CGP-NEXT: v_mov_b32_e32 v2, 0x1000 ; CGP-NEXT: v_mov_b32_e32 v3, 0 -; CGP-NEXT: v_lshl_b64 v[11:12], v[2:3], v4 -; CGP-NEXT: v_mov_b32_e32 v9, v1 -; CGP-NEXT: v_mov_b32_e32 v8, v0 -; CGP-NEXT: v_or_b32_e32 v1, v9, v12 +; CGP-NEXT: v_lshl_b64 v[12:13], v[2:3], v4 +; CGP-NEXT: v_mov_b32_e32 v8, v1 +; CGP-NEXT: v_mov_b32_e32 v7, v0 +; CGP-NEXT: v_or_b32_e32 v1, v8, v13 ; CGP-NEXT: v_mov_b32_e32 v0, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CGP-NEXT: s_cbranch_execz .LBB8_2 +; CGP-NEXT: s_xor_b64 s[6:7], vcc, exec +; CGP-NEXT: s_and_b64 s[4:5], vcc, -1 +; CGP-NEXT: s_cmov_b64 exec, vcc +; CGP-NEXT: s_cbranch_scc0 .LBB8_2 ; CGP-NEXT: ; %bb.1: -; CGP-NEXT: v_ashrrev_i32_e32 v1, 31, v12 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v11, v1 -; CGP-NEXT: v_addc_u32_e32 v4, vcc, v12, v1, vcc +; CGP-NEXT: v_ashrrev_i32_e32 v1, 31, v13 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v12, v1 +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v13, v1, vcc ; CGP-NEXT: v_xor_b32_e32 v0, v0, v1 ; CGP-NEXT: v_xor_b32_e32 v1, v4, v1 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v0 @@ -2683,78 +2688,78 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 ; CGP-NEXT: v_mov_b32_e32 v4, v11 ; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5] -; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v14 +; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v8 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v14 ; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v14, vcc +; CGP-NEXT: v_addc_u32_e32 v7, vcc, v8, v14, vcc ; CGP-NEXT: v_xor_b32_e32 v12, v4, v14 ; CGP-NEXT: v_mul_lo_u32 v4, v16, v10 -; CGP-NEXT: v_mul_lo_u32 v9, v13, v11 -; CGP-NEXT: v_xor_b32_e32 v15, v8, v14 -; CGP-NEXT: v_mul_hi_u32 v8, v13, v10 +; CGP-NEXT: v_mul_lo_u32 v8, v13, v11 +; CGP-NEXT: v_xor_b32_e32 v15, v7, v14 +; CGP-NEXT: v_mul_hi_u32 v7, v13, v10 ; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v8, v16, v11 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v9, v13, v11 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_mul_hi_u32 v10, v16, v11 +; CGP-NEXT: v_mul_lo_u32 v7, v16, v11 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; CGP-NEXT: v_mul_hi_u32 v8, v13, v11 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; CGP-NEXT: v_mul_hi_u32 v10, v16, v11 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v16, v8, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v15, v4 -; CGP-NEXT: v_mul_lo_u32 v10, v12, v8 +; CGP-NEXT: v_addc_u32_e32 v7, vcc, v16, v7, vcc +; CGP-NEXT: v_mul_lo_u32 v8, v15, v4 +; CGP-NEXT: v_mul_lo_u32 v10, v12, v7 ; CGP-NEXT: v_mul_hi_u32 v11, v12, v4 ; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v13, v15, v8 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v15, v8 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_mul_hi_u32 v10, v12, v8 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v11, v15, v7 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; CGP-NEXT: v_mul_hi_u32 v10, v12, v7 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v11, 0 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v4, v8 +; CGP-NEXT: v_mul_hi_u32 v11, v15, v7 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v0, v13, 0 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v4 -; CGP-NEXT: v_mov_b32_e32 v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v10, v[4:5] -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v12, v8 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v1, v11, v[9:10] -; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v15, v9, vcc -; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v15, v9 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v1 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v4 +; CGP-NEXT: v_mov_b32_e32 v4, v8 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v0, v10, v[4:5] +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v12, v7 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v1, v13, v[10:11] +; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v15, v10, vcc +; CGP-NEXT: v_sub_i32_e64 v8, s[4:5], v15, v10 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v0 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v1 -; CGP-NEXT: v_subb_u32_e32 v9, vcc, v9, v1, vcc +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v1 +; CGP-NEXT: v_subb_u32_e32 v8, vcc, v8, v1, vcc ; CGP-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] ; CGP-NEXT: v_sub_i32_e32 v11, vcc, v4, v0 -; CGP-NEXT: v_subbrev_u32_e64 v12, s[4:5], 0, v9, vcc +; CGP-NEXT: v_subbrev_u32_e64 v12, s[4:5], 0, v8, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v1 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v0 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v1 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v9, v1, vcc +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v8, v1, vcc ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v0 ; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v15, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc @@ -2763,156 +2768,153 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cndmask_b32_e32 v1, v12, v1, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc ; CGP-NEXT: v_xor_b32_e32 v0, v0, v14 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v14 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v14 ; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v14, vcc -; CGP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; CGP-NEXT: ; implicit-def: $vgpr8 +; CGP-NEXT: ; implicit-def: $vgpr12_vgpr13 +; CGP-NEXT: ; implicit-def: $vgpr7 +; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: .LBB8_2: ; %Flow1 -; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] -; CGP-NEXT: v_lshl_b64 v[9:10], v[2:3], v6 -; CGP-NEXT: s_xor_b64 exec, exec, s[4:5] -; CGP-NEXT: s_cbranch_execz .LBB8_4 +; CGP-NEXT: v_lshl_b64 v[10:11], v[2:3], v6 +; CGP-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; CGP-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; CGP-NEXT: s_cmov_b64 exec, s[6:7] +; CGP-NEXT: s_cbranch_scc0 .LBB8_4 ; CGP-NEXT: ; %bb.3: -; CGP-NEXT: v_cvt_f32_u32_e32 v0, v11 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v11 +; CGP-NEXT: v_cvt_f32_u32_e32 v0, v12 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v12 ; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0 ; CGP-NEXT: v_mul_lo_u32 v1, v1, v0 ; CGP-NEXT: v_mul_hi_u32 v1, v0, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 -; CGP-NEXT: v_mul_lo_u32 v0, v0, v11 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v8, v0 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v11 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v11 +; CGP-NEXT: v_mul_hi_u32 v0, v7, v0 +; CGP-NEXT: v_mul_lo_u32 v0, v0, v12 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v7, v0 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v12 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v12 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v11 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v11 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v12 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v12 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; CGP-NEXT: v_mov_b32_e32 v1, 0 -; CGP-NEXT: .LBB8_4: ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] -; CGP-NEXT: v_or_b32_e32 v3, v7, v10 +; CGP-NEXT: .LBB8_4: +; CGP-NEXT: v_or_b32_e32 v3, v9, v11 ; CGP-NEXT: v_mov_b32_e32 v2, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CGP-NEXT: s_cbranch_execnz .LBB8_7 -; CGP-NEXT: ; %bb.5: ; %Flow -; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] -; CGP-NEXT: s_cbranch_execnz .LBB8_8 -; CGP-NEXT: .LBB8_6: -; CGP-NEXT: s_or_b64 exec, exec, s[4:5] -; CGP-NEXT: s_setpc_b64 s[30:31] -; CGP-NEXT: .LBB8_7: -; CGP-NEXT: v_ashrrev_i32_e32 v3, 31, v10 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v9, v3 -; CGP-NEXT: v_addc_u32_e32 v4, vcc, v10, v3, vcc +; CGP-NEXT: s_xor_b64 s[6:7], vcc, exec +; CGP-NEXT: s_and_b64 s[4:5], vcc, -1 +; CGP-NEXT: s_cmov_b64 exec, vcc +; CGP-NEXT: s_cbranch_scc0 .LBB8_6 +; CGP-NEXT: ; %bb.5: +; CGP-NEXT: v_ashrrev_i32_e32 v3, 31, v11 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v10, v3 +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v11, v3, vcc ; CGP-NEXT: v_xor_b32_e32 v2, v2, v3 ; CGP-NEXT: v_xor_b32_e32 v3, v4, v3 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 -; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v2 -; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v2 +; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v3, vcc ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v6, v6 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v11, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 -; CGP-NEXT: v_mov_b32_e32 v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v6, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v6, v8 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v10, v11, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 -; CGP-NEXT: v_mul_lo_u32 v14, v11, v9 -; CGP-NEXT: v_mul_lo_u32 v15, v6, v9 +; CGP-NEXT: v_trunc_f32_e32 v8, v6 +; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 +; CGP-NEXT: v_cvt_u32_f32_e32 v10, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v13, v8 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v10, 0 +; CGP-NEXT: v_mov_b32_e32 v4, v7 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[4:5] +; CGP-NEXT: v_mul_lo_u32 v4, v13, v6 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v10, v[7:8] +; CGP-NEXT: v_mul_hi_u32 v8, v10, v6 +; CGP-NEXT: v_mul_hi_u32 v6, v13, v6 +; CGP-NEXT: v_mul_lo_u32 v14, v10, v7 +; CGP-NEXT: v_mul_lo_u32 v15, v13, v7 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v11, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v10, v7 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v15, v8 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v15, v6 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v14, v10 -; CGP-NEXT: v_mul_hi_u32 v9, v6, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v4 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 -; CGP-NEXT: v_mov_b32_e32 v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v6, v[4:5] -; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v7 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v12 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] -; CGP-NEXT: v_addc_u32_e32 v5, vcc, v7, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v7, v4, v12 -; CGP-NEXT: v_mul_lo_u32 v4, v6, v8 -; CGP-NEXT: v_mul_lo_u32 v10, v11, v9 -; CGP-NEXT: v_xor_b32_e32 v13, v5, v12 -; CGP-NEXT: v_mul_hi_u32 v5, v11, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v14, v8 +; CGP-NEXT: v_mul_hi_u32 v7, v13, v7 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v4 +; CGP-NEXT: v_addc_u32_e32 v13, vcc, v13, v6, vcc +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v10, 0 +; CGP-NEXT: v_mov_b32_e32 v4, v7 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[4:5] +; CGP-NEXT: v_ashrrev_i32_e32 v11, 31, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v11 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v10, v[7:8] +; CGP-NEXT: v_addc_u32_e32 v5, vcc, v9, v11, vcc +; CGP-NEXT: v_xor_b32_e32 v8, v4, v11 +; CGP-NEXT: v_mul_lo_u32 v4, v13, v6 +; CGP-NEXT: v_mul_lo_u32 v9, v10, v7 +; CGP-NEXT: v_xor_b32_e32 v12, v5, v11 +; CGP-NEXT: v_mul_hi_u32 v5, v10, v6 +; CGP-NEXT: v_mul_hi_u32 v6, v13, v6 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v6, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; CGP-NEXT: v_mul_hi_u32 v10, v11, v9 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; CGP-NEXT: v_mul_hi_u32 v9, v6, v9 +; CGP-NEXT: v_mul_lo_u32 v5, v13, v7 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CGP-NEXT: v_mul_hi_u32 v9, v10, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; CGP-NEXT: v_mul_hi_u32 v7, v13, v7 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v13, v4 -; CGP-NEXT: v_mul_lo_u32 v8, v7, v5 -; CGP-NEXT: v_mul_hi_u32 v9, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v13, v4 -; CGP-NEXT: v_mul_hi_u32 v10, v13, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; CGP-NEXT: v_addc_u32_e32 v5, vcc, v13, v5, vcc +; CGP-NEXT: v_mul_lo_u32 v6, v12, v4 +; CGP-NEXT: v_mul_lo_u32 v7, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v9, v8, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v12, v4 +; CGP-NEXT: v_mul_hi_u32 v10, v12, v5 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v13, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_mul_hi_u32 v8, v7, v5 +; CGP-NEXT: v_mul_lo_u32 v9, v12, v5 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CGP-NEXT: v_mul_hi_u32 v7, v8, v5 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v4, v6 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v9, 0 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v10, v6 ; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[5:6] -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v7, v4 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v8, v4 ; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v9, v[5:6] -; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v13, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 +; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v12, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v12, v5 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2 @@ -2925,11 +2927,11 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3 ; CGP-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 -; CGP-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v10, v10, v12, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; CGP-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc @@ -2937,17 +2939,21 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v12 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v12 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc -; CGP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; CGP-NEXT: v_xor_b32_e32 v2, v2, v11 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v11 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v11 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v11, vcc +; CGP-NEXT: ; implicit-def: $vgpr10_vgpr11 ; CGP-NEXT: ; implicit-def: $vgpr5 -; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] -; CGP-NEXT: s_cbranch_execz .LBB8_6 -; CGP-NEXT: .LBB8_8: -; CGP-NEXT: v_cvt_f32_u32_e32 v2, v9 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v9 +; CGP-NEXT: s_or_b64 exec, exec, s[6:7] +; CGP-NEXT: .LBB8_6: ; %Flow +; CGP-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; CGP-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; CGP-NEXT: s_cmov_b64 exec, s[6:7] +; CGP-NEXT: s_cbranch_scc0 .LBB8_8 +; CGP-NEXT: ; %bb.7: +; CGP-NEXT: v_cvt_f32_u32_e32 v2, v10 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v10 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 @@ -2955,16 +2961,17 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_mul_hi_u32 v2, v5, v2 -; CGP-NEXT: v_mul_lo_u32 v2, v2, v9 +; CGP-NEXT: v_mul_lo_u32 v2, v2, v10 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v9 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v9 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v10 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v10 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v9 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v9 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v10 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v10 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; CGP-NEXT: v_mov_b32_e32 v3, 0 ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] +; CGP-NEXT: .LBB8_8: ; CGP-NEXT: s_setpc_b64 s[30:31] %shl.y = shl <2 x i64> , %y %r = srem <2 x i64> %x, %shl.y diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll index d15551365707b..47545b015b8f8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll @@ -13,18 +13,13 @@ define i64 @v_udiv_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_or_b32_e32 v1, v5, v3 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CHECK-NEXT: s_xor_b64 s[6:7], vcc, exec +; CHECK-NEXT: s_and_b64 s[4:5], vcc, -1 ; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v2 ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CHECK-NEXT: s_cbranch_execnz .LBB0_3 -; CHECK-NEXT: ; %bb.1: ; %Flow -; CHECK-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; CHECK-NEXT: s_cbranch_execnz .LBB0_4 -; CHECK-NEXT: .LBB0_2: -; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] -; CHECK-NEXT: s_setpc_b64 s[30:31] -; CHECK-NEXT: .LBB0_3: +; CHECK-NEXT: s_cmov_b64 exec, vcc +; CHECK-NEXT: s_cbranch_scc0 .LBB0_2 +; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v3 ; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 ; CHECK-NEXT: v_subb_u32_e32 v7, vcc, 0, v3, vcc @@ -152,9 +147,13 @@ define i64 @v_udiv_i64(i64 %num, i64 %den) { ; CHECK-NEXT: ; implicit-def: $vgpr6 ; CHECK-NEXT: ; implicit-def: $vgpr2 ; CHECK-NEXT: ; implicit-def: $vgpr4 -; CHECK-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; CHECK-NEXT: s_cbranch_execz .LBB0_2 -; CHECK-NEXT: .LBB0_4: +; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] +; CHECK-NEXT: .LBB0_2: ; %Flow +; CHECK-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; CHECK-NEXT: s_and_b64 s[4:5], s[6:7], -1 +; CHECK-NEXT: s_cmov_b64 exec, s[6:7] +; CHECK-NEXT: s_cbranch_scc0 .LBB0_4 +; CHECK-NEXT: ; %bb.3: ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v6 ; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 ; CHECK-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -174,7 +173,8 @@ define i64 @v_udiv_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] +; CHECK-NEXT: s_or_b64 exec, exec, s[8:9] +; CHECK-NEXT: .LBB0_4: ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = udiv i64 %num, %den ret i64 %result @@ -627,11 +627,12 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_or_b32_e32 v1, v11, v5 ; CGP-NEXT: v_mov_b32_e32 v0, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CGP-NEXT: s_xor_b64 s[6:7], vcc, exec +; CGP-NEXT: s_and_b64 s[4:5], vcc, -1 ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v4 ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CGP-NEXT: s_cbranch_execz .LBB2_2 +; CGP-NEXT: s_cmov_b64 exec, vcc +; CGP-NEXT: s_cbranch_scc0 .LBB2_2 ; CGP-NEXT: ; %bb.1: ; CGP-NEXT: v_cvt_f32_u32_e32 v0, v5 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v4 @@ -760,9 +761,12 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: ; implicit-def: $vgpr2 ; CGP-NEXT: ; implicit-def: $vgpr4 ; CGP-NEXT: ; implicit-def: $vgpr10 +; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: .LBB2_2: ; %Flow1 -; CGP-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; CGP-NEXT: s_cbranch_execz .LBB2_4 +; CGP-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; CGP-NEXT: s_and_b64 s[4:5], s[6:7], -1 +; CGP-NEXT: s_cmov_b64 exec, s[6:7] +; CGP-NEXT: s_cbranch_scc0 .LBB2_4 ; CGP-NEXT: ; %bb.3: ; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v2 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v4 @@ -783,23 +787,18 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; CGP-NEXT: v_mov_b32_e32 v1, 0 +; CGP-NEXT: s_or_b64 exec, exec, s[8:9] ; CGP-NEXT: .LBB2_4: -; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: v_or_b32_e32 v3, v9, v7 ; CGP-NEXT: v_mov_b32_e32 v2, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; CGP-NEXT: s_xor_b64 s[6:7], vcc, exec +; CGP-NEXT: s_and_b64 s[4:5], vcc, -1 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v6 ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CGP-NEXT: s_cbranch_execnz .LBB2_7 -; CGP-NEXT: ; %bb.5: ; %Flow -; CGP-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; CGP-NEXT: s_cbranch_execnz .LBB2_8 -; CGP-NEXT: .LBB2_6: -; CGP-NEXT: s_or_b64 exec, exec, s[6:7] -; CGP-NEXT: s_setpc_b64 s[30:31] -; CGP-NEXT: .LBB2_7: +; CGP-NEXT: s_cmov_b64 exec, vcc +; CGP-NEXT: s_cbranch_scc0 .LBB2_6 +; CGP-NEXT: ; %bb.5: ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v7 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v6 ; CGP-NEXT: v_subb_u32_e32 v5, vcc, 0, v7, vcc @@ -927,9 +926,13 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: ; implicit-def: $vgpr4 ; CGP-NEXT: ; implicit-def: $vgpr6 ; CGP-NEXT: ; implicit-def: $vgpr8 -; CGP-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; CGP-NEXT: s_cbranch_execz .LBB2_6 -; CGP-NEXT: .LBB2_8: +; CGP-NEXT: s_or_b64 exec, exec, s[6:7] +; CGP-NEXT: .LBB2_6: ; %Flow +; CGP-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; CGP-NEXT: s_and_b64 s[4:5], s[6:7], -1 +; CGP-NEXT: s_cmov_b64 exec, s[6:7] +; CGP-NEXT: s_cbranch_scc0 .LBB2_8 +; CGP-NEXT: ; %bb.7: ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v4 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v6 ; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 @@ -949,7 +952,8 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: v_mov_b32_e32 v3, 0 -; CGP-NEXT: s_or_b64 exec, exec, s[6:7] +; CGP-NEXT: s_or_b64 exec, exec, s[8:9] +; CGP-NEXT: .LBB2_8: ; CGP-NEXT: s_setpc_b64 s[30:31] %result = udiv <2 x i64> %num, %den ret <2 x i64> %result @@ -1072,22 +1076,17 @@ define i64 @v_udiv_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_mov_b32_e32 v4, v1 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x1000 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: v_mov_b32_e32 v7, 0 ; CHECK-NEXT: v_lshl_b64 v[5:6], v[0:1], v2 -; CHECK-NEXT: v_or_b32_e32 v8, v4, v6 -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[7:8] +; CHECK-NEXT: v_or_b32_e32 v1, v4, v6 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CHECK-NEXT: s_xor_b64 s[6:7], vcc, exec +; CHECK-NEXT: s_and_b64 s[4:5], vcc, -1 ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v5 ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CHECK-NEXT: s_cbranch_execnz .LBB7_3 -; CHECK-NEXT: ; %bb.1: ; %Flow -; CHECK-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; CHECK-NEXT: s_cbranch_execnz .LBB7_4 -; CHECK-NEXT: .LBB7_2: -; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] -; CHECK-NEXT: s_setpc_b64 s[30:31] -; CHECK-NEXT: .LBB7_3: +; CHECK-NEXT: s_cmov_b64 exec, vcc +; CHECK-NEXT: s_cbranch_scc0 .LBB7_2 +; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v6 ; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v5 ; CHECK-NEXT: v_subb_u32_e32 v7, vcc, 0, v6, vcc @@ -1215,9 +1214,13 @@ define i64 @v_udiv_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: ; implicit-def: $vgpr2 ; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6 ; CHECK-NEXT: ; implicit-def: $vgpr3 -; CHECK-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; CHECK-NEXT: s_cbranch_execz .LBB7_2 -; CHECK-NEXT: .LBB7_4: +; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] +; CHECK-NEXT: .LBB7_2: ; %Flow +; CHECK-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; CHECK-NEXT: s_and_b64 s[4:5], s[6:7], -1 +; CHECK-NEXT: s_cmov_b64 exec, s[6:7] +; CHECK-NEXT: s_cbranch_scc0 .LBB7_4 +; CHECK-NEXT: ; %bb.3: ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v2 ; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v5 ; CHECK-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -1237,7 +1240,8 @@ define i64 @v_udiv_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] +; CHECK-NEXT: s_or_b64 exec, exec, s[8:9] +; CHECK-NEXT: .LBB7_4: ; CHECK-NEXT: s_setpc_b64 s[30:31] %shl.y = shl i64 4096, %y %r = udiv i64 %x, %shl.y @@ -1513,15 +1517,16 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_mov_b32_e32 v7, v3 ; CGP-NEXT: v_mov_b32_e32 v10, 0x1000 ; CGP-NEXT: v_mov_b32_e32 v11, 0 -; CGP-NEXT: v_mov_b32_e32 v0, 0 ; CGP-NEXT: v_lshl_b64 v[2:3], v[10:11], v4 ; CGP-NEXT: v_or_b32_e32 v1, v9, v3 +; CGP-NEXT: v_mov_b32_e32 v0, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CGP-NEXT: s_xor_b64 s[6:7], vcc, exec +; CGP-NEXT: s_and_b64 s[4:5], vcc, -1 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CGP-NEXT: s_cbranch_execz .LBB8_2 +; CGP-NEXT: s_cmov_b64 exec, vcc +; CGP-NEXT: s_cbranch_scc0 .LBB8_2 ; CGP-NEXT: ; %bb.1: ; CGP-NEXT: v_cvt_f32_u32_e32 v0, v3 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 @@ -1650,11 +1655,13 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: ; implicit-def: $vgpr4 ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CGP-NEXT: ; implicit-def: $vgpr8 +; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: .LBB8_2: ; %Flow1 -; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7] +; CGP-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; CGP-NEXT: s_and_b64 s[4:5], s[6:7], -1 ; CGP-NEXT: v_lshl_b64 v[9:10], v[10:11], v6 -; CGP-NEXT: s_xor_b64 exec, exec, s[6:7] -; CGP-NEXT: s_cbranch_execz .LBB8_4 +; CGP-NEXT: s_cmov_b64 exec, s[6:7] +; CGP-NEXT: s_cbranch_scc0 .LBB8_4 ; CGP-NEXT: ; %bb.3: ; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v4 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 @@ -1675,23 +1682,18 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CGP-NEXT: v_mov_b32_e32 v1, 0 +; CGP-NEXT: s_or_b64 exec, exec, s[8:9] ; CGP-NEXT: .LBB8_4: -; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: v_or_b32_e32 v3, v7, v10 ; CGP-NEXT: v_mov_b32_e32 v2, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; CGP-NEXT: s_xor_b64 s[6:7], vcc, exec +; CGP-NEXT: s_and_b64 s[4:5], vcc, -1 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v9 ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CGP-NEXT: s_cbranch_execnz .LBB8_7 -; CGP-NEXT: ; %bb.5: ; %Flow -; CGP-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; CGP-NEXT: s_cbranch_execnz .LBB8_8 -; CGP-NEXT: .LBB8_6: -; CGP-NEXT: s_or_b64 exec, exec, s[6:7] -; CGP-NEXT: s_setpc_b64 s[30:31] -; CGP-NEXT: .LBB8_7: +; CGP-NEXT: s_cmov_b64 exec, vcc +; CGP-NEXT: s_cbranch_scc0 .LBB8_6 +; CGP-NEXT: ; %bb.5: ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v10 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v9 ; CGP-NEXT: v_subb_u32_e32 v6, vcc, 0, v10, vcc @@ -1819,9 +1821,13 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: ; implicit-def: $vgpr4 ; CGP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; CGP-NEXT: ; implicit-def: $vgpr5 -; CGP-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; CGP-NEXT: s_cbranch_execz .LBB8_6 -; CGP-NEXT: .LBB8_8: +; CGP-NEXT: s_or_b64 exec, exec, s[6:7] +; CGP-NEXT: .LBB8_6: ; %Flow +; CGP-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; CGP-NEXT: s_and_b64 s[4:5], s[6:7], -1 +; CGP-NEXT: s_cmov_b64 exec, s[6:7] +; CGP-NEXT: s_cbranch_scc0 .LBB8_8 +; CGP-NEXT: ; %bb.7: ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v4 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v9 ; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 @@ -1841,7 +1847,8 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v9 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: v_mov_b32_e32 v3, 0 -; CGP-NEXT: s_or_b64 exec, exec, s[6:7] +; CGP-NEXT: s_or_b64 exec, exec, s[8:9] +; CGP-NEXT: .LBB8_8: ; CGP-NEXT: s_setpc_b64 s[30:31] %shl.y = shl <2 x i64> , %y %r = udiv <2 x i64> %x, %shl.y diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll index cc0f7e2ca5a54..5311585bfaa9e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -13,18 +13,13 @@ define i64 @v_urem_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_or_b32_e32 v1, v5, v3 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CHECK-NEXT: s_xor_b64 s[6:7], vcc, exec +; CHECK-NEXT: s_and_b64 s[4:5], vcc, -1 ; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v2 ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CHECK-NEXT: s_cbranch_execnz .LBB0_3 -; CHECK-NEXT: ; %bb.1: ; %Flow -; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] -; CHECK-NEXT: s_cbranch_execnz .LBB0_4 -; CHECK-NEXT: .LBB0_2: -; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_setpc_b64 s[30:31] -; CHECK-NEXT: .LBB0_3: +; CHECK-NEXT: s_cmov_b64 exec, vcc +; CHECK-NEXT: s_cbranch_scc0 .LBB0_2 +; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v3 ; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 ; CHECK-NEXT: v_subb_u32_e32 v7, vcc, 0, v3, vcc @@ -151,9 +146,13 @@ define i64 @v_urem_i64(i64 %num, i64 %den) { ; CHECK-NEXT: ; implicit-def: $vgpr6 ; CHECK-NEXT: ; implicit-def: $vgpr2 ; CHECK-NEXT: ; implicit-def: $vgpr4 -; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] -; CHECK-NEXT: s_cbranch_execz .LBB0_2 -; CHECK-NEXT: .LBB0_4: +; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] +; CHECK-NEXT: .LBB0_2: ; %Flow +; CHECK-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; CHECK-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; CHECK-NEXT: s_cmov_b64 exec, s[6:7] +; CHECK-NEXT: s_cbranch_scc0 .LBB0_4 +; CHECK-NEXT: ; %bb.3: ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v6 ; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 ; CHECK-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -172,6 +171,7 @@ define i64 @v_urem_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: .LBB0_4: ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = urem i64 %num, %den ret i64 %result @@ -619,11 +619,12 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_or_b32_e32 v1, v11, v5 ; CGP-NEXT: v_mov_b32_e32 v0, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CGP-NEXT: s_xor_b64 s[6:7], vcc, exec +; CGP-NEXT: s_and_b64 s[4:5], vcc, -1 ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v4 ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CGP-NEXT: s_cbranch_execz .LBB2_2 +; CGP-NEXT: s_cmov_b64 exec, vcc +; CGP-NEXT: s_cbranch_scc0 .LBB2_2 ; CGP-NEXT: ; %bb.1: ; CGP-NEXT: v_cvt_f32_u32_e32 v0, v5 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v4 @@ -751,9 +752,12 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: ; implicit-def: $vgpr2 ; CGP-NEXT: ; implicit-def: $vgpr4 ; CGP-NEXT: ; implicit-def: $vgpr10 +; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: .LBB2_2: ; %Flow1 -; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] -; CGP-NEXT: s_cbranch_execz .LBB2_4 +; CGP-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; CGP-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; CGP-NEXT: s_cmov_b64 exec, s[6:7] +; CGP-NEXT: s_cbranch_scc0 .LBB2_4 ; CGP-NEXT: ; %bb.3: ; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v2 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v4 @@ -772,23 +776,18 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; CGP-NEXT: v_mov_b32_e32 v1, 0 -; CGP-NEXT: .LBB2_4: ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] +; CGP-NEXT: .LBB2_4: ; CGP-NEXT: v_or_b32_e32 v3, v9, v7 ; CGP-NEXT: v_mov_b32_e32 v2, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; CGP-NEXT: s_xor_b64 s[6:7], vcc, exec +; CGP-NEXT: s_and_b64 s[4:5], vcc, -1 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v6 ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CGP-NEXT: s_cbranch_execnz .LBB2_7 -; CGP-NEXT: ; %bb.5: ; %Flow -; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] -; CGP-NEXT: s_cbranch_execnz .LBB2_8 -; CGP-NEXT: .LBB2_6: -; CGP-NEXT: s_or_b64 exec, exec, s[4:5] -; CGP-NEXT: s_setpc_b64 s[30:31] -; CGP-NEXT: .LBB2_7: +; CGP-NEXT: s_cmov_b64 exec, vcc +; CGP-NEXT: s_cbranch_scc0 .LBB2_6 +; CGP-NEXT: ; %bb.5: ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v7 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v6 ; CGP-NEXT: v_subb_u32_e32 v5, vcc, 0, v7, vcc @@ -915,9 +914,13 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: ; implicit-def: $vgpr4 ; CGP-NEXT: ; implicit-def: $vgpr6 ; CGP-NEXT: ; implicit-def: $vgpr8 -; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] -; CGP-NEXT: s_cbranch_execz .LBB2_6 -; CGP-NEXT: .LBB2_8: +; CGP-NEXT: s_or_b64 exec, exec, s[6:7] +; CGP-NEXT: .LBB2_6: ; %Flow +; CGP-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; CGP-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; CGP-NEXT: s_cmov_b64 exec, s[6:7] +; CGP-NEXT: s_cbranch_scc0 .LBB2_8 +; CGP-NEXT: ; %bb.7: ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v4 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v6 ; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 @@ -936,6 +939,7 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; CGP-NEXT: v_mov_b32_e32 v3, 0 ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] +; CGP-NEXT: .LBB2_8: ; CGP-NEXT: s_setpc_b64 s[30:31] %result = urem <2 x i64> %num, %den ret <2 x i64> %result @@ -1501,22 +1505,17 @@ define i64 @v_urem_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_mov_b32_e32 v4, v1 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x1000 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: v_mov_b32_e32 v7, 0 ; CHECK-NEXT: v_lshl_b64 v[5:6], v[0:1], v2 -; CHECK-NEXT: v_or_b32_e32 v8, v4, v6 -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[7:8] +; CHECK-NEXT: v_or_b32_e32 v1, v4, v6 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CHECK-NEXT: s_xor_b64 s[6:7], vcc, exec +; CHECK-NEXT: s_and_b64 s[4:5], vcc, -1 ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v5 ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CHECK-NEXT: s_cbranch_execnz .LBB7_3 -; CHECK-NEXT: ; %bb.1: ; %Flow -; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] -; CHECK-NEXT: s_cbranch_execnz .LBB7_4 -; CHECK-NEXT: .LBB7_2: -; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_setpc_b64 s[30:31] -; CHECK-NEXT: .LBB7_3: +; CHECK-NEXT: s_cmov_b64 exec, vcc +; CHECK-NEXT: s_cbranch_scc0 .LBB7_2 +; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v6 ; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v5 ; CHECK-NEXT: v_subb_u32_e32 v7, vcc, 0, v6, vcc @@ -1643,9 +1642,13 @@ define i64 @v_urem_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: ; implicit-def: $vgpr2 ; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6 ; CHECK-NEXT: ; implicit-def: $vgpr3 -; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] -; CHECK-NEXT: s_cbranch_execz .LBB7_2 -; CHECK-NEXT: .LBB7_4: +; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] +; CHECK-NEXT: .LBB7_2: ; %Flow +; CHECK-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; CHECK-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; CHECK-NEXT: s_cmov_b64 exec, s[6:7] +; CHECK-NEXT: s_cbranch_scc0 .LBB7_4 +; CHECK-NEXT: ; %bb.3: ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v2 ; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v5 ; CHECK-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -1664,6 +1667,7 @@ define i64 @v_urem_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: .LBB7_4: ; CHECK-NEXT: s_setpc_b64 s[30:31] %shl.y = shl i64 4096, %y %r = urem i64 %x, %shl.y @@ -1937,15 +1941,16 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_mov_b32_e32 v7, v3 ; CGP-NEXT: v_mov_b32_e32 v10, 0x1000 ; CGP-NEXT: v_mov_b32_e32 v11, 0 -; CGP-NEXT: v_mov_b32_e32 v0, 0 ; CGP-NEXT: v_lshl_b64 v[2:3], v[10:11], v4 ; CGP-NEXT: v_or_b32_e32 v1, v9, v3 +; CGP-NEXT: v_mov_b32_e32 v0, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CGP-NEXT: s_xor_b64 s[6:7], vcc, exec +; CGP-NEXT: s_and_b64 s[4:5], vcc, -1 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CGP-NEXT: s_cbranch_execz .LBB8_2 +; CGP-NEXT: s_cmov_b64 exec, vcc +; CGP-NEXT: s_cbranch_scc0 .LBB8_2 ; CGP-NEXT: ; %bb.1: ; CGP-NEXT: v_cvt_f32_u32_e32 v0, v3 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 @@ -2073,11 +2078,13 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: ; implicit-def: $vgpr4 ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CGP-NEXT: ; implicit-def: $vgpr8 +; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: .LBB8_2: ; %Flow1 -; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] +; CGP-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; CGP-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; CGP-NEXT: v_lshl_b64 v[9:10], v[10:11], v6 -; CGP-NEXT: s_xor_b64 exec, exec, s[4:5] -; CGP-NEXT: s_cbranch_execz .LBB8_4 +; CGP-NEXT: s_cmov_b64 exec, s[6:7] +; CGP-NEXT: s_cbranch_scc0 .LBB8_4 ; CGP-NEXT: ; %bb.3: ; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v4 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 @@ -2096,23 +2103,18 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; CGP-NEXT: v_mov_b32_e32 v1, 0 -; CGP-NEXT: .LBB8_4: ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] +; CGP-NEXT: .LBB8_4: ; CGP-NEXT: v_or_b32_e32 v3, v7, v10 ; CGP-NEXT: v_mov_b32_e32 v2, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; CGP-NEXT: s_xor_b64 s[6:7], vcc, exec +; CGP-NEXT: s_and_b64 s[4:5], vcc, -1 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v9 ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 -; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; CGP-NEXT: s_cbranch_execnz .LBB8_7 -; CGP-NEXT: ; %bb.5: ; %Flow -; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] -; CGP-NEXT: s_cbranch_execnz .LBB8_8 -; CGP-NEXT: .LBB8_6: -; CGP-NEXT: s_or_b64 exec, exec, s[4:5] -; CGP-NEXT: s_setpc_b64 s[30:31] -; CGP-NEXT: .LBB8_7: +; CGP-NEXT: s_cmov_b64 exec, vcc +; CGP-NEXT: s_cbranch_scc0 .LBB8_6 +; CGP-NEXT: ; %bb.5: ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v10 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v9 ; CGP-NEXT: v_subb_u32_e32 v6, vcc, 0, v10, vcc @@ -2239,9 +2241,13 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: ; implicit-def: $vgpr4 ; CGP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; CGP-NEXT: ; implicit-def: $vgpr5 -; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] -; CGP-NEXT: s_cbranch_execz .LBB8_6 -; CGP-NEXT: .LBB8_8: +; CGP-NEXT: s_or_b64 exec, exec, s[6:7] +; CGP-NEXT: .LBB8_6: ; %Flow +; CGP-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; CGP-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; CGP-NEXT: s_cmov_b64 exec, s[6:7] +; CGP-NEXT: s_cbranch_scc0 .LBB8_8 +; CGP-NEXT: ; %bb.7: ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v4 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v9 ; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 @@ -2260,6 +2266,7 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; CGP-NEXT: v_mov_b32_e32 v3, 0 ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] +; CGP-NEXT: .LBB8_8: ; CGP-NEXT: s_setpc_b64 s[30:31] %shl.y = shl <2 x i64> , %y %r = urem <2 x i64> %x, %shl.y diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll index 9d4f9434aa314..ca3045fc8b2a1 100644 --- a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll +++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; RUN: llc -mtriple=amdgcn--amdpal -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG,GFX8 -enable-var-scope %s ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG,GFX9 -enable-var-scope %s ; RUN: llc -global-isel -mtriple=amdgcn--amdpal -mattr=-xnack -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GISEL,GFX9 -enable-var-scope %s diff --git a/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll b/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll index e03c9ca34b825..fa7445a15ca9a 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll @@ -13,31 +13,41 @@ define amdgpu_ps void @main(i32 %arg) { ; GFX10-NEXT: s_mov_b32 s1, exec_lo ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_mov_b32 s2, 0 -; GFX10-NEXT: s_branch .LBB0_2 -; GFX10-NEXT: .LBB0_1: ; in Loop: Header=BB0_2 Depth=1 +; GFX10-NEXT: s_branch .LBB0_3 +; GFX10-NEXT: .LBB0_1: ; %Flow +; GFX10-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10-NEXT: .LBB0_2: ; in Loop: Header=BB0_3 Depth=1 ; GFX10-NEXT: s_and_b32 s0, exec_lo, vcc_lo ; GFX10-NEXT: s_or_b32 s2, s0, s2 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX10-NEXT: s_cbranch_execz .LBB0_5 -; GFX10-NEXT: .LBB0_2: ; %bb4 +; GFX10-NEXT: s_andn2_b32 s0, exec_lo, s2 +; GFX10-NEXT: s_and_b32 s3, s0, -1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s0, s2 +; GFX10-NEXT: s_cbranch_scc0 .LBB0_6 +; GFX10-NEXT: .LBB0_3: ; %bb4 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_and_saveexec_b32 s3, s1 -; GFX10-NEXT: s_cbranch_execz .LBB0_1 -; GFX10-NEXT: ; %bb.3: ; in Loop: Header=BB0_2 Depth=1 +; GFX10-NEXT: s_and_b32 s0, s1, exec_lo +; GFX10-NEXT: s_mov_b32 s3, exec_lo +; GFX10-NEXT: s_and_b32 s5, s0, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, s0 +; GFX10-NEXT: s_cbranch_scc0 .LBB0_2 +; GFX10-NEXT: ; %bb.4: ; in Loop: Header=BB0_3 Depth=1 ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX10-NEXT: s_mov_b32 s8, exec_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v1 -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s0 -; GFX10-NEXT: s_cbranch_execz .LBB0_1 -; GFX10-NEXT: ; %bb.4: ; in Loop: Header=BB0_2 Depth=1 +; GFX10-NEXT: s_and_b32 s5, s0, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, s0 +; GFX10-NEXT: s_cbranch_scc0 .LBB0_1 +; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB0_3 Depth=1 ; GFX10-NEXT: s_mov_b32 s5, s4 ; GFX10-NEXT: s_mov_b32 s6, s4 ; GFX10-NEXT: s_mov_b32 s7, s4 ; GFX10-NEXT: buffer_atomic_and v0, off, s[4:7], 0 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_branch .LBB0_1 -; GFX10-NEXT: .LBB0_5: ; %bb8 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: .LBB0_6: ; %bb8 ; GFX10-NEXT: s_mov_b32 s0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll index 624101dc12c5f..4a782dcc89fef 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -20,12 +20,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-LABEL: add_i32_constant: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec +; GFX6-NEXT: s_mov_b64 s[2:3], exec ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX6-NEXT: s_cbranch_execz .LBB0_2 +; GFX6-NEXT: s_cmov_b64 exec, vcc +; GFX6-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX6-NEXT: ; %bb.1: ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -33,8 +35,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc -; GFX6-NEXT: .LBB0_2: ; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: .LBB0_2: ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -51,9 +53,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB0_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -61,8 +65,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc -; GFX8-NEXT: .LBB0_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: .LBB0_2: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 @@ -79,9 +83,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB0_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -89,8 +95,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc -; GFX9-NEXT: .LBB0_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB0_2: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 @@ -103,12 +109,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-LABEL: add_i32_constant: ; GFX10W64: ; %bb.0: ; %entry ; GFX10W64-NEXT: s_mov_b64 s[4:5], exec -; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_cbranch_execz .LBB0_2 +; GFX10W64-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX10W64-NEXT: s_cmov_b64 exec, vcc +; GFX10W64-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -116,9 +124,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc -; GFX10W64-NEXT: .LBB0_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: .LBB0_2: ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -131,11 +139,13 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W32-LABEL: add_i32_constant: ; GFX10W32: ; %bb.0: ; %entry ; GFX10W32-NEXT: s_mov_b32 s3, exec_lo -; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: s_mov_b32 s2, exec_lo ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB0_2 +; GFX10W32-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX10W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10W32-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX10W32-NEXT: ; %bb.1: ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX10W32-NEXT: s_bcnt1_i32_b32 s3, s3 @@ -143,9 +153,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc -; GFX10W32-NEXT: .LBB0_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: .LBB0_2: ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 @@ -163,8 +173,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W64-NEXT: s_cbranch_execz .LBB0_2 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11W64-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX11W64-NEXT: s_cmov_b64 exec, vcc +; GFX11W64-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -173,8 +185,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc -; GFX11W64-NEXT: .LBB0_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: .LBB0_2: ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -194,8 +206,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W32-NEXT: s_cbranch_execz .LBB0_2 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11W32-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX11W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11W32-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX11W32-NEXT: ; %bb.1: ; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX11W32-NEXT: s_bcnt1_i32_b32 s3, s3 @@ -204,8 +218,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc -; GFX11W32-NEXT: .LBB0_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: .LBB0_2: ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 @@ -226,8 +240,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W64-NEXT: s_cbranch_execz .LBB0_2 +; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX12W64-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX12W64-NEXT: s_cmov_b64 exec, vcc +; GFX12W64-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX12W64-NEXT: ; %bb.1: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -236,8 +252,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: .LBB0_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: .LBB0_2: ; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -257,8 +273,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W32-NEXT: s_cbranch_execz .LBB0_2 +; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX12W32-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX12W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12W32-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s3, s3 @@ -267,8 +285,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: .LBB0_2: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX12W32-NEXT: .LBB0_2: ; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 @@ -290,13 +308,15 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX6-LABEL: add_i32_uniform: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec +; GFX6-NEXT: s_mov_b64 s[2:3], exec ; GFX6-NEXT: s_load_dword s6, s[0:1], 0x11 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX6-NEXT: s_cbranch_execz .LBB1_2 +; GFX6-NEXT: s_cmov_b64 exec, vcc +; GFX6-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX6-NEXT: ; %bb.1: ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -304,8 +324,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX6-NEXT: s_mul_i32 s4, s6, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc -; GFX6-NEXT: .LBB1_2: ; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: .LBB1_2: ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -319,14 +339,16 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: add_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB1_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -334,8 +356,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX8-NEXT: s_mul_i32 s4, s6, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc -; GFX8-NEXT: .LBB1_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: .LBB1_2: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -349,14 +371,16 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: add_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -364,8 +388,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: s_mul_i32 s4, s6, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc -; GFX9-NEXT: .LBB1_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB1_2: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -380,12 +404,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W64: ; %bb.0: ; %entry ; GFX10W64-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX10W64-NEXT: s_mov_b64 s[4:5], exec -; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_cbranch_execz .LBB1_2 +; GFX10W64-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX10W64-NEXT: s_cmov_b64 exec, vcc +; GFX10W64-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -393,9 +419,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W64-NEXT: s_mul_i32 s4, s6, s4 ; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W64-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc -; GFX10W64-NEXT: .LBB1_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: .LBB1_2: ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -409,11 +435,13 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W32: ; %bb.0: ; %entry ; GFX10W32-NEXT: s_load_dword s2, s[0:1], 0x44 ; GFX10W32-NEXT: s_mov_b32 s4, exec_lo -; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB1_2 +; GFX10W32-NEXT: s_and_b32 s5, vcc_lo, -1 +; GFX10W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10W32-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX10W32-NEXT: ; %bb.1: ; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4 @@ -421,9 +449,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX10W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W32-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc -; GFX10W32-NEXT: .LBB1_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10W32-NEXT: .LBB1_2: ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: v_readfirstlane_b32 s4, v1 @@ -442,8 +470,10 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W64-NEXT: s_cbranch_execz .LBB1_2 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11W64-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX11W64-NEXT: s_cmov_b64 exec, vcc +; GFX11W64-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -452,8 +482,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc -; GFX11W64-NEXT: .LBB1_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: .LBB1_2: ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -474,8 +504,10 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W32-NEXT: s_cbranch_execz .LBB1_2 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11W32-NEXT: s_and_b32 s5, vcc_lo, -1 +; GFX11W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11W32-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX11W32-NEXT: ; %bb.1: ; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4 @@ -484,8 +516,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc -; GFX11W32-NEXT: .LBB1_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11W32-NEXT: .LBB1_2: ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1 @@ -507,8 +539,10 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W64-NEXT: s_cbranch_execz .LBB1_2 +; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX12W64-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX12W64-NEXT: s_cmov_b64 exec, vcc +; GFX12W64-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX12W64-NEXT: ; %bb.1: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -517,8 +551,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: .LBB1_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: .LBB1_2: ; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -539,8 +573,10 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W32-NEXT: s_cbranch_execz .LBB1_2 +; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX12W32-NEXT: s_and_b32 s5, vcc_lo, -1 +; GFX12W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12W32-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 @@ -549,8 +585,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: .LBB1_2: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX12W32-NEXT: .LBB1_2: ; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1 @@ -601,17 +637,18 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX8-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX8-NEXT: s_cbranch_execz .LBB2_4 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc -; GFX8-NEXT: .LBB2_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: .LBB2_4: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 @@ -642,17 +679,18 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX9-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX9-NEXT: s_cbranch_execz .LBB2_4 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc -; GFX9-NEXT: .LBB2_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB2_4: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 @@ -682,17 +720,18 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX10W64-NEXT: s_cbranch_execz .LBB2_4 +; GFX10W64-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX10W64-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX10W64-NEXT: s_cmov_b64 exec, vcc +; GFX10W64-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX10W64-NEXT: ; %bb.3: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc -; GFX10W64-NEXT: .LBB2_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: .LBB2_4: ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 @@ -721,17 +760,18 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 -; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX10W32-NEXT: s_cbranch_execz .LBB2_4 +; GFX10W32-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX10W32-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX10W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10W32-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX10W32-NEXT: ; %bb.3: ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc -; GFX10W32-NEXT: .LBB2_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10W32-NEXT: .LBB2_4: ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 @@ -763,17 +803,17 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX11W64-NEXT: ; implicit-def: $vgpr0 -; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX11W64-NEXT: s_cbranch_execz .LBB2_4 +; GFX11W64-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX11W64-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX11W64-NEXT: s_cmov_b64 exec, vcc +; GFX11W64-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX11W64-NEXT: ; %bb.3: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc -; GFX11W64-NEXT: .LBB2_4: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: .LBB2_4: ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 @@ -804,19 +844,20 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11W32-NEXT: ; implicit-def: $vgpr0 -; GFX11W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX11W32-NEXT: s_cbranch_execz .LBB2_4 +; GFX11W32-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX11W32-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX11W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11W32-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX11W32-NEXT: ; %bb.3: ; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc -; GFX11W32-NEXT: .LBB2_4: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11W32-NEXT: .LBB2_4: ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 @@ -850,17 +891,17 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX12W64-NEXT: ; implicit-def: $vgpr0 -; GFX12W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX12W64-NEXT: s_cbranch_execz .LBB2_4 +; GFX12W64-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX12W64-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX12W64-NEXT: s_cmov_b64 exec, vcc +; GFX12W64-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX12W64-NEXT: ; %bb.3: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: .LBB2_4: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: .LBB2_4: ; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0 @@ -891,19 +932,20 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX12W32-NEXT: ; implicit-def: $vgpr0 -; GFX12W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX12W32-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX12W32-NEXT: s_cbranch_execz .LBB2_4 +; GFX12W32-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX12W32-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX12W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12W32-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX12W32-NEXT: ; %bb.3: ; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX12W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: .LBB2_4: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX12W32-NEXT: .LBB2_4: ; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0 @@ -956,10 +998,11 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX8-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX8-NEXT: s_cbranch_execz .LBB3_4 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: s_load_dword s5, s[0:1], 0x44 ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 @@ -967,8 +1010,8 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, s5 ; GFX8-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc -; GFX8-NEXT: .LBB3_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: .LBB3_4: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 @@ -999,10 +1042,11 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX9-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX9-NEXT: s_cbranch_execz .LBB3_4 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dword s5, s[0:1], 0x44 ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 @@ -1010,8 +1054,8 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc -; GFX9-NEXT: .LBB3_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB3_4: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 @@ -1041,9 +1085,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX10W64-NEXT: s_cbranch_execz .LBB3_4 +; GFX10W64-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX10W64-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX10W64-NEXT: s_cmov_b64 exec, vcc +; GFX10W64-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX10W64-NEXT: ; %bb.3: ; GFX10W64-NEXT: s_clause 0x1 ; GFX10W64-NEXT: s_load_dword s5, s[0:1], 0x44 @@ -1052,9 +1097,9 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: v_mov_b32_e32 v2, s5 ; GFX10W64-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc -; GFX10W64-NEXT: .LBB3_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: .LBB3_4: ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 @@ -1083,9 +1128,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 -; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX10W32-NEXT: s_cbranch_execz .LBB3_4 +; GFX10W32-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX10W32-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX10W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10W32-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX10W32-NEXT: ; %bb.3: ; GFX10W32-NEXT: s_clause 0x1 ; GFX10W32-NEXT: s_load_dword s8, s[0:1], 0x44 @@ -1094,9 +1140,9 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: v_mov_b32_e32 v2, s8 ; GFX10W32-NEXT: buffer_atomic_add v0, v2, s[4:7], 0 idxen glc -; GFX10W32-NEXT: .LBB3_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10W32-NEXT: .LBB3_4: ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 @@ -1128,10 +1174,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX11W64-NEXT: ; implicit-def: $vgpr0 -; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX11W64-NEXT: s_cbranch_execz .LBB3_4 +; GFX11W64-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX11W64-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX11W64-NEXT: s_cmov_b64 exec, vcc +; GFX11W64-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX11W64-NEXT: ; %bb.3: ; GFX11W64-NEXT: s_clause 0x1 ; GFX11W64-NEXT: s_load_b32 s5, s[0:1], 0x44 @@ -1140,8 +1186,8 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: v_mov_b32_e32 v2, s5 ; GFX11W64-NEXT: buffer_atomic_add_u32 v0, v2, s[8:11], 0 idxen glc -; GFX11W64-NEXT: .LBB3_4: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: .LBB3_4: ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 @@ -1172,12 +1218,13 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11W32-NEXT: ; implicit-def: $vgpr0 -; GFX11W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX11W32-NEXT: s_cbranch_execz .LBB3_4 +; GFX11W32-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX11W32-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX11W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11W32-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX11W32-NEXT: ; %bb.3: ; GFX11W32-NEXT: s_clause 0x1 ; GFX11W32-NEXT: s_load_b32 s8, s[0:1], 0x44 @@ -1186,8 +1233,8 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: v_mov_b32_e32 v2, s8 ; GFX11W32-NEXT: buffer_atomic_add_u32 v0, v2, s[4:7], 0 idxen glc -; GFX11W32-NEXT: .LBB3_4: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11W32-NEXT: .LBB3_4: ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 @@ -1221,10 +1268,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX12W64-NEXT: ; implicit-def: $vgpr0 -; GFX12W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX12W64-NEXT: s_cbranch_execz .LBB3_4 +; GFX12W64-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX12W64-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX12W64-NEXT: s_cmov_b64 exec, vcc +; GFX12W64-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX12W64-NEXT: ; %bb.3: ; GFX12W64-NEXT: s_clause 0x1 ; GFX12W64-NEXT: s_load_b32 s5, s[0:1], 0x44 @@ -1233,8 +1280,8 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: v_mov_b32_e32 v2, s5 ; GFX12W64-NEXT: buffer_atomic_add_u32 v0, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: .LBB3_4: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: .LBB3_4: ; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0 @@ -1265,12 +1312,13 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W32-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX12W32-NEXT: ; implicit-def: $vgpr0 -; GFX12W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX12W32-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX12W32-NEXT: s_cbranch_execz .LBB3_4 +; GFX12W32-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX12W32-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX12W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12W32-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX12W32-NEXT: ; %bb.3: ; GFX12W32-NEXT: s_clause 0x1 ; GFX12W32-NEXT: s_load_b32 s8, s[0:1], 0x44 @@ -1279,8 +1327,8 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: v_mov_b32_e32 v2, s8 ; GFX12W32-NEXT: buffer_atomic_add_u32 v0, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: .LBB3_4: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX12W32-NEXT: .LBB3_4: ; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0 @@ -1387,12 +1435,14 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-LABEL: sub_i32_constant: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec +; GFX6-NEXT: s_mov_b64 s[2:3], exec ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX6-NEXT: s_cbranch_execz .LBB5_2 +; GFX6-NEXT: s_cmov_b64 exec, vcc +; GFX6-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX6-NEXT: ; %bb.1: ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1400,8 +1450,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc -; GFX6-NEXT: .LBB5_2: ; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: .LBB5_2: ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -1419,9 +1469,11 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB5_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1429,8 +1481,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc -; GFX8-NEXT: .LBB5_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: .LBB5_2: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 @@ -1448,9 +1500,11 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1458,8 +1512,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc -; GFX9-NEXT: .LBB5_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB5_2: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 @@ -1473,12 +1527,14 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-LABEL: sub_i32_constant: ; GFX10W64: ; %bb.0: ; %entry ; GFX10W64-NEXT: s_mov_b64 s[4:5], exec -; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_cbranch_execz .LBB5_2 +; GFX10W64-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX10W64-NEXT: s_cmov_b64 exec, vcc +; GFX10W64-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1486,9 +1542,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc -; GFX10W64-NEXT: .LBB5_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: .LBB5_2: ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -1502,11 +1558,13 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W32-LABEL: sub_i32_constant: ; GFX10W32: ; %bb.0: ; %entry ; GFX10W32-NEXT: s_mov_b32 s3, exec_lo -; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: s_mov_b32 s2, exec_lo ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB5_2 +; GFX10W32-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX10W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10W32-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX10W32-NEXT: ; %bb.1: ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX10W32-NEXT: s_bcnt1_i32_b32 s3, s3 @@ -1514,9 +1572,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[4:7], 0 glc -; GFX10W32-NEXT: .LBB5_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: .LBB5_2: ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 @@ -1535,8 +1593,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W64-NEXT: s_cbranch_execz .LBB5_2 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11W64-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX11W64-NEXT: s_cmov_b64 exec, vcc +; GFX11W64-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1545,8 +1605,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc -; GFX11W64-NEXT: .LBB5_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: .LBB5_2: ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -1567,8 +1627,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W32-NEXT: s_cbranch_execz .LBB5_2 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11W32-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX11W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11W32-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX11W32-NEXT: ; %bb.1: ; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX11W32-NEXT: s_bcnt1_i32_b32 s3, s3 @@ -1577,8 +1639,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], 0 glc -; GFX11W32-NEXT: .LBB5_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: .LBB5_2: ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 @@ -1600,8 +1662,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W64-NEXT: s_cbranch_execz .LBB5_2 +; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX12W64-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX12W64-NEXT: s_cmov_b64 exec, vcc +; GFX12W64-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX12W64-NEXT: ; %bb.1: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1610,8 +1674,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: .LBB5_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: .LBB5_2: ; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -1632,8 +1696,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W32-NEXT: s_cbranch_execz .LBB5_2 +; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX12W32-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX12W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12W32-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s3, s3 @@ -1642,8 +1708,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: .LBB5_2: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX12W32-NEXT: .LBB5_2: ; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 @@ -1666,13 +1732,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX6-LABEL: sub_i32_uniform: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec +; GFX6-NEXT: s_mov_b64 s[2:3], exec ; GFX6-NEXT: s_load_dword s6, s[0:1], 0x11 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX6-NEXT: s_cbranch_execz .LBB6_2 +; GFX6-NEXT: s_cmov_b64 exec, vcc +; GFX6-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX6-NEXT: ; %bb.1: ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1680,8 +1748,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX6-NEXT: s_mul_i32 s4, s6, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc -; GFX6-NEXT: .LBB6_2: ; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: .LBB6_2: ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -1695,14 +1763,16 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: sub_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB6_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1710,8 +1780,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX8-NEXT: s_mul_i32 s4, s6, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc -; GFX8-NEXT: .LBB6_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: .LBB6_2: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -1725,14 +1795,16 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: sub_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1740,8 +1812,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: s_mul_i32 s4, s6, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc -; GFX9-NEXT: .LBB6_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB6_2: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -1756,12 +1828,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W64: ; %bb.0: ; %entry ; GFX10W64-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX10W64-NEXT: s_mov_b64 s[4:5], exec -; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_cbranch_execz .LBB6_2 +; GFX10W64-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX10W64-NEXT: s_cmov_b64 exec, vcc +; GFX10W64-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1769,9 +1843,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W64-NEXT: s_mul_i32 s4, s6, s4 ; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc -; GFX10W64-NEXT: .LBB6_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: .LBB6_2: ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -1786,11 +1860,13 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W32: ; %bb.0: ; %entry ; GFX10W32-NEXT: s_load_dword s2, s[0:1], 0x44 ; GFX10W32-NEXT: s_mov_b32 s4, exec_lo -; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB6_2 +; GFX10W32-NEXT: s_and_b32 s5, vcc_lo, -1 +; GFX10W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10W32-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX10W32-NEXT: ; %bb.1: ; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4 @@ -1798,9 +1874,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX10W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc -; GFX10W32-NEXT: .LBB6_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10W32-NEXT: .LBB6_2: ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: v_mul_lo_u32 v0, s2, v0 @@ -1820,8 +1896,10 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W64-NEXT: s_cbranch_execz .LBB6_2 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11W64-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX11W64-NEXT: s_cmov_b64 exec, vcc +; GFX11W64-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1830,8 +1908,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc -; GFX11W64-NEXT: .LBB6_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: .LBB6_2: ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -1853,8 +1931,10 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W32-NEXT: s_cbranch_execz .LBB6_2 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11W32-NEXT: s_and_b32 s5, vcc_lo, -1 +; GFX11W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11W32-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX11W32-NEXT: ; %bb.1: ; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4 @@ -1863,8 +1943,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc -; GFX11W32-NEXT: .LBB6_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11W32-NEXT: .LBB6_2: ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: v_mul_lo_u32 v0, s2, v0 @@ -1887,8 +1967,10 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W64-NEXT: s_cbranch_execz .LBB6_2 +; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX12W64-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX12W64-NEXT: s_cmov_b64 exec, vcc +; GFX12W64-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX12W64-NEXT: ; %bb.1: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1897,8 +1979,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: .LBB6_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: .LBB6_2: ; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -1920,8 +2002,10 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W32-NEXT: s_cbranch_execz .LBB6_2 +; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX12W32-NEXT: s_and_b32 s5, vcc_lo, -1 +; GFX12W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12W32-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 @@ -1930,8 +2014,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: .LBB6_2: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX12W32-NEXT: .LBB6_2: ; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: v_mul_lo_u32 v0, s2, v0 @@ -1983,17 +2067,18 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX8-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX8-NEXT: s_cbranch_execz .LBB7_4 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc -; GFX8-NEXT: .LBB7_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: .LBB7_4: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 @@ -2024,17 +2109,18 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX9-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX9-NEXT: s_cbranch_execz .LBB7_4 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc -; GFX9-NEXT: .LBB7_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB7_4: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 @@ -2064,17 +2150,18 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX10W64-NEXT: s_cbranch_execz .LBB7_4 +; GFX10W64-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX10W64-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX10W64-NEXT: s_cmov_b64 exec, vcc +; GFX10W64-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX10W64-NEXT: ; %bb.3: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc -; GFX10W64-NEXT: .LBB7_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: .LBB7_4: ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 @@ -2103,17 +2190,18 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 -; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX10W32-NEXT: s_cbranch_execz .LBB7_4 +; GFX10W32-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX10W32-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX10W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10W32-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX10W32-NEXT: ; %bb.3: ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc -; GFX10W32-NEXT: .LBB7_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10W32-NEXT: .LBB7_4: ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 @@ -2145,17 +2233,17 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX11W64-NEXT: ; implicit-def: $vgpr0 -; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX11W64-NEXT: s_cbranch_execz .LBB7_4 +; GFX11W64-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX11W64-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX11W64-NEXT: s_cmov_b64 exec, vcc +; GFX11W64-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX11W64-NEXT: ; %bb.3: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc -; GFX11W64-NEXT: .LBB7_4: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: .LBB7_4: ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 @@ -2186,19 +2274,20 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11W32-NEXT: ; implicit-def: $vgpr0 -; GFX11W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX11W32-NEXT: s_cbranch_execz .LBB7_4 +; GFX11W32-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX11W32-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX11W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11W32-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX11W32-NEXT: ; %bb.3: ; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc -; GFX11W32-NEXT: .LBB7_4: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11W32-NEXT: .LBB7_4: ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 @@ -2233,17 +2322,17 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX12W64-NEXT: ; implicit-def: $vgpr0 -; GFX12W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX12W64-NEXT: s_cbranch_execz .LBB7_4 +; GFX12W64-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX12W64-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX12W64-NEXT: s_cmov_b64 exec, vcc +; GFX12W64-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX12W64-NEXT: ; %bb.3: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: .LBB7_4: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: .LBB7_4: ; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0 @@ -2274,19 +2363,20 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX12W32-NEXT: ; implicit-def: $vgpr0 -; GFX12W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX12W32-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX12W32-NEXT: s_cbranch_execz .LBB7_4 +; GFX12W32-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX12W32-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX12W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12W32-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX12W32-NEXT: ; %bb.3: ; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX12W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: .LBB7_4: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX12W32-NEXT: .LBB7_4: ; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 8ee0ee3b27bae..9051b11722573 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -17,13 +17,15 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX7LESS-LABEL: add_i32_constant: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec +; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec ; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB0_2 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] @@ -36,8 +38,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX7LESS-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: buffer_wbinvl1 -; GFX7LESS-NEXT: .LBB0_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: .LBB0_2: ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 @@ -48,14 +50,16 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX89-LABEL: add_i32_constant: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_mov_b64 s[6:7], exec +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX89-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX89-NEXT: s_mov_b64 s[4:5], exec +; GFX89-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX89-NEXT: ; implicit-def: $vgpr1 -; GFX89-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX89-NEXT: s_cbranch_execz .LBB0_2 +; GFX89-NEXT: s_cmov_b64 exec, vcc +; GFX89-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX89-NEXT: ; %bb.1: ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: s_mov_b32 s8, s2 @@ -68,8 +72,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX89-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX89-NEXT: s_waitcnt vmcnt(0) ; GFX89-NEXT: buffer_wbinvl1_vol -; GFX89-NEXT: .LBB0_2: ; GFX89-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX89-NEXT: .LBB0_2: ; GFX89-NEXT: v_readfirstlane_b32 s4, v1 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: s_mov_b32 s3, 0xf000 @@ -82,12 +86,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: s_mov_b64 s[6:7], exec -; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: s_mov_b64 s[4:5], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB0_2 +; GFX1064-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 @@ -101,9 +107,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB0_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: .LBB0_2: ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 @@ -116,11 +122,13 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: s_mov_b32 s5, exec_lo -; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: s_mov_b32 s4, exec_lo ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB0_2 +; GFX1032-NEXT: s_and_b32 s6, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 ; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 @@ -134,9 +142,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB0_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: .LBB0_2: ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 @@ -154,8 +162,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB0_2 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 @@ -169,8 +179,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl1_inv ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB0_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB0_2: ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 @@ -190,8 +200,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB0_2 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_and_b32 s6, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 ; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 @@ -205,8 +217,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: buffer_gl1_inv ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB0_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: .LBB0_2: ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 @@ -227,8 +239,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: ; implicit-def: $vgpr1 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 -; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1264-NEXT: s_cbranch_execz .LBB0_2 +; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1264-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX1264-NEXT: s_cmov_b64 exec, vcc +; GFX1264-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX1264-NEXT: ; %bb.1: ; GFX1264-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 @@ -241,8 +255,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV -; GFX1264-NEXT: .LBB0_2: ; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1264-NEXT: .LBB0_2: ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 @@ -262,8 +276,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 ; GFX1232-NEXT: ; implicit-def: $vgpr1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1232-NEXT: s_cbranch_execz .LBB0_2 +; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1232-NEXT: s_and_b32 s6, vcc_lo, -1 +; GFX1232-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1232-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX1232-NEXT: ; %bb.1: ; GFX1232-NEXT: s_bcnt1_i32_b32 s5, s5 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 @@ -276,8 +292,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV -; GFX1232-NEXT: .LBB0_2: ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1232-NEXT: .LBB0_2: ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 @@ -297,65 +313,69 @@ entry: define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(1) %inout, i32 %additive) { ; GFX7LESS-LABEL: add_i32_uniform: ; GFX7LESS: ; %bb.0: ; %entry +; GFX7LESS-NEXT: s_mov_b64 s[8:9], exec ; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7LESS-NEXT: s_load_dword s8, s[0:1], 0xd -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: s_load_dword s0, s[0:1], 0xd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s9, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_b64 s[10:11], vcc, -1 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB1_2 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s1, s[8:9] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mul_i32 s2, s8, s2 -; GFX7LESS-NEXT: s_mov_b32 s14, -1 -; GFX7LESS-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 -; GFX7LESS-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc +; GFX7LESS-NEXT: s_mul_i32 s1, s0, s1 +; GFX7LESS-NEXT: s_mov_b32 s10, -1 +; GFX7LESS-NEXT: s_mov_b32 s8, s6 +; GFX7LESS-NEXT: s_mov_b32 s9, s7 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1 +; GFX7LESS-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX7LESS-NEXT: .LBB1_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 -; GFX7LESS-NEXT: v_mul_lo_u32 v0, s8, v0 -; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 +; GFX7LESS-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s1, v0 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7LESS-NEXT: s_endpgm ; ; GFX8-LABEL: add_i32_uniform: ; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_mov_b64 s[8:9], exec ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dword s8, s[0:1], 0x34 -; GFX8-NEXT: s_mov_b64 s[2:3], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_and_b64 s[10:11], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_cbranch_execz .LBB1_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8-NEXT: s_bcnt1_i32_b64 s1, s[8:9] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mul_i32 s2, s8, s2 +; GFX8-NEXT: s_mul_i32 s1, s0, s1 ; GFX8-NEXT: s_mov_b32 s15, 0xf000 ; GFX8-NEXT: s_mov_b32 s14, -1 ; GFX8-NEXT: s_mov_b32 s12, s6 ; GFX8-NEXT: s_mov_b32 s13, s7 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: .LBB1_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0 +; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v1 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 @@ -365,31 +385,33 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: add_i32_uniform: ; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_mov_b64 s[8:9], exec ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_load_dword s10, s[0:1], 0x34 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_and_b64 s[0:1], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[8:9] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s2, s8, s2 +; GFX9-NEXT: s_mul_i32 s0, s10, s0 ; GFX9-NEXT: s_mov_b32 s15, 0xf000 ; GFX9-NEXT: s_mov_b32 s14, -1 ; GFX9-NEXT: s_mov_b32 s12, s6 ; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: .LBB1_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, s10, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 @@ -401,35 +423,37 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX1064-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX1064-NEXT: s_load_dword s10, s[0:1], 0x34 +; GFX1064-NEXT: s_mov_b64 s[8:9], exec ; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB1_2 +; GFX1064-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[8:9] ; GFX1064-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mul_i32 s2, s8, s2 +; GFX1064-NEXT: s_mul_i32 s0, s10, s0 ; GFX1064-NEXT: s_mov_b32 s14, -1 -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: v_mov_b32_e32 v1, s0 ; GFX1064-NEXT: s_mov_b32 s12, s6 ; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB1_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: .LBB1_2: ; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s6, -1 -; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v0, s[0:1] +; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v0, s[0:1] ; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX1064-NEXT: s_endpgm ; @@ -438,28 +462,30 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1032-NEXT: s_clause 0x1 ; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX1032-NEXT: s_mov_b32 s8, exec_lo ; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB1_2 +; GFX1032-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 +; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s8 ; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s1, s2, s1 +; GFX1032-NEXT: s_mul_i32 s0, s2, s0 ; GFX1032-NEXT: s_mov_b32 s10, -1 -; GFX1032-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032-NEXT: v_mov_b32_e32 v1, s0 ; GFX1032-NEXT: s_mov_b32 s8, s6 ; GFX1032-NEXT: s_mov_b32 s9, s7 ; GFX1032-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB1_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: .LBB1_2: ; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 @@ -472,36 +498,38 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX1164-NEXT: s_load_b32 s8, s[0:1], 0x34 +; GFX1164-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX1164-NEXT: s_mov_b64 s[8:9], exec ; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB1_2 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_and_b64 s[10:11], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1164-NEXT: s_mov_b32 s15, 0x31016000 +; GFX1164-NEXT: s_bcnt1_i32_b64 s1, s[8:9] +; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_mul_i32 s2, s8, s2 -; GFX1164-NEXT: s_mov_b32 s14, -1 -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-NEXT: s_mov_b32 s12, s6 -; GFX1164-NEXT: s_mov_b32 s13, s7 -; GFX1164-NEXT: buffer_atomic_add_u32 v1, off, s[12:15], 0 glc +; GFX1164-NEXT: s_mul_i32 s1, s0, s1 +; GFX1164-NEXT: s_mov_b32 s10, -1 +; GFX1164-NEXT: v_mov_b32_e32 v1, s1 +; GFX1164-NEXT: s_mov_b32 s8, s6 +; GFX1164-NEXT: s_mov_b32 s9, s7 +; GFX1164-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl1_inv ; GFX1164-NEXT: buffer_gl0_inv +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: .LBB1_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s6, -1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_mad_u64_u32 v[1:2], null, s8, v0, s[0:1] +; GFX1164-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[2:3] ; GFX1164-NEXT: buffer_store_b32 v1, off, s[4:7], 0 ; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -512,28 +540,30 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX1132-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB1_2 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_and_b32 s1, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s3 ; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mul_i32 s2, s0, s2 +; GFX1132-NEXT: s_mul_i32 s1, s0, s1 ; GFX1132-NEXT: s_mov_b32 s10, -1 -; GFX1132-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-NEXT: v_mov_b32_e32 v1, s1 ; GFX1132-NEXT: s_mov_b32 s8, s6 ; GFX1132-NEXT: s_mov_b32 s9, s7 ; GFX1132-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: buffer_gl1_inv ; GFX1132-NEXT: buffer_gl0_inv +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: .LBB1_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 @@ -549,35 +579,37 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264: ; %bb.0: ; %entry ; GFX1264-NEXT: s_clause 0x1 ; GFX1264-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX1264-NEXT: s_load_b32 s8, s[0:1], 0x34 +; GFX1264-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX1264-NEXT: s_mov_b64 s[8:9], exec ; GFX1264-NEXT: s_mov_b64 s[2:3], exec -; GFX1264-NEXT: s_mov_b64 s[0:1], exec -; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1264-NEXT: ; implicit-def: $vgpr1 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1264-NEXT: s_cbranch_execz .LBB1_2 +; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1264-NEXT: s_and_b64 s[10:11], vcc, -1 +; GFX1264-NEXT: s_cmov_b64 exec, vcc +; GFX1264-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX1264-NEXT: ; %bb.1: -; GFX1264-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1264-NEXT: s_mov_b32 s15, 0x31016000 +; GFX1264-NEXT: s_bcnt1_i32_b64 s1, s[8:9] +; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: s_mul_i32 s2, s8, s2 -; GFX1264-NEXT: s_mov_b32 s14, -1 -; GFX1264-NEXT: v_mov_b32_e32 v1, s2 -; GFX1264-NEXT: s_mov_b32 s12, s6 -; GFX1264-NEXT: s_mov_b32 s13, s7 -; GFX1264-NEXT: buffer_atomic_add_u32 v1, off, s[12:15], null th:TH_ATOMIC_RETURN +; GFX1264-NEXT: s_mul_i32 s1, s0, s1 +; GFX1264-NEXT: s_mov_b32 s10, -1 +; GFX1264-NEXT: v_mov_b32_e32 v1, s1 +; GFX1264-NEXT: s_mov_b32 s8, s6 +; GFX1264-NEXT: s_mov_b32 s9, s7 +; GFX1264-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV +; GFX1264-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1264-NEXT: .LBB1_2: -; GFX1264-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1264-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1264-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1264-NEXT: s_mov_b32 s6, -1 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, s8, v0, s[0:1] +; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v0, s[2:3] ; GFX1264-NEXT: buffer_store_b32 v0, off, s[4:7], null ; GFX1264-NEXT: s_nop 0 ; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -588,27 +620,29 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: s_clause 0x1 ; GFX1232-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX1232-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX1232-NEXT: s_mov_b32 s3, exec_lo ; GFX1232-NEXT: s_mov_b32 s2, exec_lo -; GFX1232-NEXT: s_mov_b32 s1, exec_lo -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1232-NEXT: ; implicit-def: $vgpr1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1232-NEXT: s_cbranch_execz .LBB1_2 +; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1232-NEXT: s_and_b32 s1, vcc_lo, -1 +; GFX1232-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1232-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX1232-NEXT: ; %bb.1: -; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1232-NEXT: s_bcnt1_i32_b32 s1, s3 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1232-NEXT: s_wait_kmcnt 0x0 -; GFX1232-NEXT: s_mul_i32 s2, s0, s2 +; GFX1232-NEXT: s_mul_i32 s1, s0, s1 ; GFX1232-NEXT: s_mov_b32 s10, -1 -; GFX1232-NEXT: v_mov_b32_e32 v1, s2 +; GFX1232-NEXT: v_mov_b32_e32 v1, s1 ; GFX1232-NEXT: s_mov_b32 s8, s6 ; GFX1232-NEXT: s_mov_b32 s9, s7 ; GFX1232-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV +; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1232-NEXT: .LBB1_2: -; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1232-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mov_b32 s7, 0x31016000 @@ -665,10 +699,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX8-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX8-NEXT: s_cbranch_execz .LBB2_4 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: s_mov_b32 s11, 0xf000 ; GFX8-NEXT: s_mov_b32 s10, -1 @@ -679,8 +714,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: .LBB2_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: .LBB2_4: ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s3, 0xf000 @@ -710,10 +745,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB2_4 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 @@ -724,8 +760,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: .LBB2_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB2_4: ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s3, 0xf000 @@ -755,9 +791,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX1064-NEXT: s_cbranch_execz .LBB2_4 +; GFX1064-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX1064-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, s6 ; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 @@ -769,9 +806,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB2_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: .LBB2_4: ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 @@ -800,9 +837,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1032-NEXT: s_xor_b32 s5, exec_lo, s5 -; GFX1032-NEXT: s_cbranch_execz .LBB2_4 +; GFX1032-NEXT: s_xor_b32 s5, vcc_lo, exec_lo +; GFX1032-NEXT: s_and_b32 s6, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, s4 ; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 @@ -814,9 +852,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB2_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1032-NEXT: .LBB2_4: ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 @@ -848,10 +886,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX1164-NEXT: s_cbranch_execz .LBB2_4 +; GFX1164-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX1164-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: v_mov_b32_e32 v0, s6 ; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 @@ -863,8 +901,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl1_inv ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB2_4: ; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB2_4: ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 @@ -895,12 +933,13 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1132-NEXT: s_xor_b32 s5, exec_lo, s5 -; GFX1132-NEXT: s_cbranch_execz .LBB2_4 +; GFX1132-NEXT: s_xor_b32 s5, vcc_lo, exec_lo +; GFX1132-NEXT: s_and_b32 s6, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: v_mov_b32_e32 v0, s4 ; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 @@ -912,8 +951,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: buffer_gl1_inv ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB2_4: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1132-NEXT: .LBB2_4: ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 @@ -948,10 +987,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1264-NEXT: ; implicit-def: $vgpr0 -; GFX1264-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1264-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1264-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX1264-NEXT: s_cbranch_execz .LBB2_4 +; GFX1264-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX1264-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX1264-NEXT: s_cmov_b64 exec, vcc +; GFX1264-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX1264-NEXT: ; %bb.3: ; GFX1264-NEXT: v_mov_b32_e32 v0, s6 ; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 @@ -962,8 +1001,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV -; GFX1264-NEXT: .LBB2_4: ; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1264-NEXT: .LBB2_4: ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 @@ -994,12 +1033,13 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1232-NEXT: ; implicit-def: $vgpr0 -; GFX1232-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1232-NEXT: s_xor_b32 s5, exec_lo, s5 -; GFX1232-NEXT: s_cbranch_execz .LBB2_4 +; GFX1232-NEXT: s_xor_b32 s5, vcc_lo, exec_lo +; GFX1232-NEXT: s_and_b32 s6, vcc_lo, -1 +; GFX1232-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1232-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX1232-NEXT: ; %bb.3: ; GFX1232-NEXT: v_mov_b32_e32 v0, s4 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 @@ -1010,8 +1050,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV -; GFX1232-NEXT: .LBB2_4: ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1232-NEXT: .LBB2_4: ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 @@ -1033,13 +1073,15 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX7LESS-LABEL: add_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec +; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec ; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX7LESS-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB3_2 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] @@ -1053,8 +1095,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX7LESS-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: buffer_wbinvl1 -; GFX7LESS-NEXT: .LBB3_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: .LBB3_2: ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 @@ -1071,14 +1113,16 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX89-LABEL: add_i64_constant: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_mov_b64 s[6:7], exec +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX89-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX89-NEXT: s_mov_b64 s[4:5], exec +; GFX89-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX89-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX89-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX89-NEXT: s_cbranch_execz .LBB3_2 +; GFX89-NEXT: s_cmov_b64 exec, vcc +; GFX89-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX89-NEXT: ; %bb.1: ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: s_mov_b32 s8, s2 @@ -1092,8 +1136,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX89-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc ; GFX89-NEXT: s_waitcnt vmcnt(0) ; GFX89-NEXT: buffer_wbinvl1_vol -; GFX89-NEXT: .LBB3_2: ; GFX89-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX89-NEXT: .LBB3_2: ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_readfirstlane_b32 s2, v0 ; GFX89-NEXT: v_readfirstlane_b32 s3, v1 @@ -1110,12 +1154,14 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: s_mov_b64 s[6:7], exec +; GFX1064-NEXT: s_mov_b64 s[4:5], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB3_2 +; GFX1064-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 @@ -1130,9 +1176,9 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB3_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: .LBB3_2: ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 @@ -1146,11 +1192,13 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: s_mov_b32 s5, exec_lo -; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1032-NEXT: s_mov_b32 s4, exec_lo ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB3_2 +; GFX1032-NEXT: s_and_b32 s6, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 @@ -1165,9 +1213,9 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB3_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: .LBB3_2: ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 @@ -1186,8 +1234,10 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-NEXT: s_cbranch_execz .LBB3_2 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 @@ -1202,8 +1252,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl1_inv ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB3_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB3_2: ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 @@ -1224,8 +1274,10 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-NEXT: s_cbranch_execz .LBB3_2 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-NEXT: s_and_b32 s6, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 ; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 @@ -1239,8 +1291,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: buffer_gl1_inv ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB3_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: .LBB3_2: ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 @@ -1257,14 +1309,16 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264: ; %bb.0: ; %entry ; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1264-NEXT: s_mov_b64 s[6:7], exec -; GFX1264-NEXT: s_mov_b32 s9, 0 -; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1264-NEXT: s_mov_b64 s[4:5], exec +; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX1264-NEXT: s_mov_b32 s9, 0 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1264-NEXT: s_cbranch_execz .LBB3_2 +; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1264-NEXT: s_and_b64 s[10:11], vcc, -1 +; GFX1264-NEXT: s_cmov_b64 exec, vcc +; GFX1264-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX1264-NEXT: ; %bb.1: ; GFX1264-NEXT: s_bcnt1_i32_b64 s8, s[6:7] ; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 @@ -1278,8 +1332,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV -; GFX1264-NEXT: .LBB3_2: ; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1264-NEXT: .LBB3_2: ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1264-NEXT: v_readfirstlane_b32 s3, v1 @@ -1296,13 +1350,15 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232: ; %bb.0: ; %entry ; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1232-NEXT: s_mov_b32 s4, exec_lo -; GFX1232-NEXT: s_mov_b32 s5, 0 -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s4, 0 ; GFX1232-NEXT: s_mov_b32 s6, exec_lo +; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s4, 0 +; GFX1232-NEXT: s_mov_b32 s5, 0 ; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1232-NEXT: s_cbranch_execz .LBB3_2 +; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1232-NEXT: s_and_b32 s7, vcc_lo, -1 +; GFX1232-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1232-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX1232-NEXT: ; %bb.1: ; GFX1232-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 @@ -1315,8 +1371,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV -; GFX1232-NEXT: .LBB3_2: ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX1232-NEXT: .LBB3_2: ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1232-NEXT: v_readfirstlane_b32 s3, v1 @@ -1338,14 +1394,16 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX7LESS-LABEL: add_i64_uniform: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[8:9], exec +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s9, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX7LESS-NEXT: s_and_b64 s[10:11], vcc, -1 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB4_2 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s14, -1 @@ -1362,8 +1420,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX7LESS-NEXT: buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: buffer_wbinvl1 -; GFX7LESS-NEXT: .LBB4_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: .LBB4_2: ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 @@ -1382,15 +1440,17 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: add_i64_uniform: ; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_mov_b64 s[8:9], exec ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b64 s[8:9], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_and_b64 s[10:11], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB4_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s12, s6 @@ -1405,8 +1465,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX8-NEXT: buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: .LBB4_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: .LBB4_2: ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_readfirstlane_b32 s3, v1 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -1422,33 +1482,35 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: add_i64_uniform: ; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_mov_b64 s[10:11], exec ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_mov_b64 s[8:9], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s10, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s11, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_mov_b64 s[8:9], exec +; GFX9-NEXT: s_and_b64 s[0:1], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB4_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[10:11] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[8:9] -; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mul_i32 s7, s3, s6 -; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 -; GFX9-NEXT: s_add_i32 s8, s8, s7 -; GFX9-NEXT: s_mul_i32 s6, s2, s6 +; GFX9-NEXT: s_mul_i32 s1, s3, s0 +; GFX9-NEXT: s_mul_hi_u32 s6, s2, s0 +; GFX9-NEXT: s_add_i32 s6, s6, s1 +; GFX9-NEXT: s_mul_i32 s0, s2, s0 ; GFX9-NEXT: s_mov_b32 s15, 0xf000 ; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: .LBB4_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -1466,33 +1528,35 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1064-NEXT: s_mov_b64 s[10:11], exec ; GFX1064-NEXT: s_mov_b64 s[8:9], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s10, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s11, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB4_2 +; GFX1064-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s8, s[8:9] -; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[10:11] +; GFX1064-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mul_i32 s9, s3, s8 -; GFX1064-NEXT: s_mul_hi_u32 s10, s2, s8 -; GFX1064-NEXT: s_mul_i32 s8, s2, s8 -; GFX1064-NEXT: s_add_i32 s10, s10, s9 -; GFX1064-NEXT: v_mov_b32_e32 v0, s8 +; GFX1064-NEXT: s_mul_i32 s1, s3, s0 +; GFX1064-NEXT: s_mul_hi_u32 s10, s2, s0 +; GFX1064-NEXT: s_mul_i32 s0, s2, s0 +; GFX1064-NEXT: s_add_i32 s10, s10, s1 +; GFX1064-NEXT: v_mov_b32_e32 v0, s0 ; GFX1064-NEXT: v_mov_b32_e32 v1, s10 -; GFX1064-NEXT: s_mov_b32 s10, -1 -; GFX1064-NEXT: s_mov_b32 s8, s6 -; GFX1064-NEXT: s_mov_b32 s9, s7 -; GFX1064-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc +; GFX1064-NEXT: s_mov_b32 s14, -1 +; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB4_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1064-NEXT: .LBB4_2: ; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s1, v1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -1508,32 +1572,34 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1032-NEXT: s_clause 0x1 ; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1032-NEXT: s_mov_b32 s9, exec_lo ; GFX1032-NEXT: s_mov_b32 s8, exec_lo +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s9, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s8 -; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s9 +; GFX1032-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s8, s3, s1 -; GFX1032-NEXT: s_mul_hi_u32 s9, s2, s1 -; GFX1032-NEXT: s_mul_i32 s1, s2, s1 -; GFX1032-NEXT: s_add_i32 s9, s9, s8 -; GFX1032-NEXT: v_mov_b32_e32 v0, s1 +; GFX1032-NEXT: s_mul_i32 s1, s3, s0 +; GFX1032-NEXT: s_mul_hi_u32 s9, s2, s0 +; GFX1032-NEXT: s_mul_i32 s0, s2, s0 +; GFX1032-NEXT: s_add_i32 s9, s9, s1 +; GFX1032-NEXT: v_mov_b32_e32 v0, s0 ; GFX1032-NEXT: v_mov_b32_e32 v1, s9 -; GFX1032-NEXT: s_mov_b32 s10, -1 -; GFX1032-NEXT: s_mov_b32 s8, s6 -; GFX1032-NEXT: s_mov_b32 s9, s7 -; GFX1032-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc +; GFX1032-NEXT: s_mov_b32 s14, -1 +; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB4_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1032-NEXT: .LBB4_2: ; GFX1032-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s1, v1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -1555,8 +1621,10 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-NEXT: s_and_b64 s[10:11], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s8, s[8:9] ; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 @@ -1574,8 +1642,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl1_inv ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB4_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: .LBB4_2: ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -1601,8 +1669,10 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-NEXT: s_and_b32 s8, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 @@ -1620,8 +1690,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: buffer_gl1_inv ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB4_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: .LBB4_2: ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) @@ -1643,14 +1713,16 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX1264-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX1264-NEXT: s_mov_b64 s[8:9], exec -; GFX1264-NEXT: s_mov_b32 s11, 0 -; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1264-NEXT: s_mov_b64 s[2:3], exec +; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1264-NEXT: s_mov_b32 s11, 0 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1264-NEXT: s_cbranch_execz .LBB4_2 +; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1264-NEXT: s_and_b64 s[12:13], vcc, -1 +; GFX1264-NEXT: s_cmov_b64 exec, vcc +; GFX1264-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1264-NEXT: ; %bb.1: ; GFX1264-NEXT: s_bcnt1_i32_b64 s10, s[8:9] ; GFX1264-NEXT: s_wait_kmcnt 0x0 @@ -1664,8 +1736,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV -; GFX1264-NEXT: .LBB4_2: ; GFX1264-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1264-NEXT: .LBB4_2: ; GFX1264-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1264-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1264-NEXT: s_wait_kmcnt 0x0 @@ -1685,13 +1757,15 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX1232-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX1232-NEXT: s_mov_b32 s2, exec_lo -; GFX1232-NEXT: s_mov_b32 s3, 0 -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0 ; GFX1232-NEXT: s_mov_b32 s8, exec_lo +; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0 +; GFX1232-NEXT: s_mov_b32 s3, 0 ; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1232-NEXT: s_cbranch_execz .LBB4_2 +; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1232-NEXT: s_and_b32 s9, vcc_lo, -1 +; GFX1232-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1232-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1232-NEXT: ; %bb.1: ; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX1232-NEXT: s_mov_b32 s15, 0x31016000 @@ -1704,8 +1778,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: buffer_atomic_add_u64 v[0:1], off, s[12:15], null th:TH_ATOMIC_RETURN ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV -; GFX1232-NEXT: .LBB4_2: ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1232-NEXT: .LBB4_2: ; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1232-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1232-NEXT: s_wait_kmcnt 0x0 @@ -1837,13 +1911,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX7LESS-LABEL: sub_i32_constant: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec +; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec ; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB6_2 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] @@ -1856,8 +1932,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX7LESS-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: buffer_wbinvl1 -; GFX7LESS-NEXT: .LBB6_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: .LBB6_2: ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 @@ -1869,14 +1945,16 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: sub_i32_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB6_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s8, s2 @@ -1889,8 +1967,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: .LBB6_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: .LBB6_2: ; GFX8-NEXT: v_readfirstlane_b32 s4, v1 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1902,14 +1980,16 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: sub_i32_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[6:7], exec +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s8, s2 @@ -1922,8 +2002,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: .LBB6_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB6_2: ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1937,12 +2017,14 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: s_mov_b64 s[6:7], exec -; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: s_mov_b64 s[4:5], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB6_2 +; GFX1064-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 @@ -1956,9 +2038,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB6_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: .LBB6_2: ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1972,11 +2054,13 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: s_mov_b32 s5, exec_lo -; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: s_mov_b32 s4, exec_lo ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB6_2 +; GFX1032-NEXT: s_and_b32 s6, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 ; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 @@ -1990,9 +2074,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB6_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: .LBB6_2: ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -2011,8 +2095,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB6_2 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 @@ -2026,8 +2112,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl1_inv ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB6_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB6_2: ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -2048,8 +2134,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB6_2 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_and_b32 s6, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 ; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 @@ -2063,8 +2151,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: buffer_gl1_inv ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB6_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: .LBB6_2: ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -2086,8 +2174,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: ; implicit-def: $vgpr1 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 -; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1264-NEXT: s_cbranch_execz .LBB6_2 +; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1264-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX1264-NEXT: s_cmov_b64 exec, vcc +; GFX1264-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX1264-NEXT: ; %bb.1: ; GFX1264-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 @@ -2100,8 +2190,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV -; GFX1264-NEXT: .LBB6_2: ; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1264-NEXT: .LBB6_2: ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1264-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -2122,8 +2212,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 ; GFX1232-NEXT: ; implicit-def: $vgpr1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1232-NEXT: s_cbranch_execz .LBB6_2 +; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1232-NEXT: s_and_b32 s6, vcc_lo, -1 +; GFX1232-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1232-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX1232-NEXT: ; %bb.1: ; GFX1232-NEXT: s_bcnt1_i32_b32 s5, s5 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 @@ -2136,8 +2228,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV -; GFX1232-NEXT: .LBB6_2: ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1232-NEXT: .LBB6_2: ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1232-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -2158,65 +2250,69 @@ entry: define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(1) %inout, i32 %subitive) { ; GFX7LESS-LABEL: sub_i32_uniform: ; GFX7LESS: ; %bb.0: ; %entry +; GFX7LESS-NEXT: s_mov_b64 s[8:9], exec ; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7LESS-NEXT: s_load_dword s8, s[0:1], 0xd -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: s_load_dword s0, s[0:1], 0xd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s9, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_b64 s[10:11], vcc, -1 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB7_2 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s1, s[8:9] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mul_i32 s2, s8, s2 -; GFX7LESS-NEXT: s_mov_b32 s14, -1 -; GFX7LESS-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 -; GFX7LESS-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc +; GFX7LESS-NEXT: s_mul_i32 s1, s0, s1 +; GFX7LESS-NEXT: s_mov_b32 s10, -1 +; GFX7LESS-NEXT: s_mov_b32 s8, s6 +; GFX7LESS-NEXT: s_mov_b32 s9, s7 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1 +; GFX7LESS-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX7LESS-NEXT: .LBB7_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 -; GFX7LESS-NEXT: v_mul_lo_u32 v0, s8, v0 -; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 +; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 +; GFX7LESS-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s1, v0 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7LESS-NEXT: s_endpgm ; ; GFX8-LABEL: sub_i32_uniform: ; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_mov_b64 s[8:9], exec ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dword s8, s[0:1], 0x34 -; GFX8-NEXT: s_mov_b64 s[2:3], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_and_b64 s[10:11], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_cbranch_execz .LBB7_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8-NEXT: s_bcnt1_i32_b64 s1, s[8:9] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mul_i32 s2, s8, s2 +; GFX8-NEXT: s_mul_i32 s1, s0, s1 ; GFX8-NEXT: s_mov_b32 s15, 0xf000 ; GFX8-NEXT: s_mov_b32 s14, -1 ; GFX8-NEXT: s_mov_b32 s12, s6 ; GFX8-NEXT: s_mov_b32 s13, s7 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: .LBB7_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0 +; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v1 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 @@ -2226,31 +2322,33 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: sub_i32_uniform: ; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_mov_b64 s[8:9], exec ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_load_dword s10, s[0:1], 0x34 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_and_b64 s[0:1], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB7_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[8:9] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s2, s8, s2 +; GFX9-NEXT: s_mul_i32 s0, s10, s0 ; GFX9-NEXT: s_mov_b32 s15, 0xf000 ; GFX9-NEXT: s_mov_b32 s14, -1 ; GFX9-NEXT: s_mov_b32 s12, s6 ; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: .LBB7_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, s10, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 @@ -2262,32 +2360,34 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX1064-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX1064-NEXT: s_load_dword s10, s[0:1], 0x34 +; GFX1064-NEXT: s_mov_b64 s[8:9], exec ; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB7_2 +; GFX1064-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[8:9] ; GFX1064-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mul_i32 s2, s8, s2 +; GFX1064-NEXT: s_mul_i32 s0, s10, s0 ; GFX1064-NEXT: s_mov_b32 s14, -1 -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: v_mov_b32_e32 v1, s0 ; GFX1064-NEXT: s_mov_b32 s12, s6 ; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB7_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: .LBB7_2: ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mul_lo_u32 v0, s8, v0 +; GFX1064-NEXT: v_mul_lo_u32 v0, s10, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s6, -1 @@ -2300,28 +2400,30 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1032-NEXT: s_clause 0x1 ; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX1032-NEXT: s_mov_b32 s8, exec_lo ; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB7_2 +; GFX1032-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 +; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s8 ; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s1, s2, s1 +; GFX1032-NEXT: s_mul_i32 s0, s2, s0 ; GFX1032-NEXT: s_mov_b32 s10, -1 -; GFX1032-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032-NEXT: v_mov_b32_e32 v1, s0 ; GFX1032-NEXT: s_mov_b32 s8, s6 ; GFX1032-NEXT: s_mov_b32 s9, s7 ; GFX1032-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB7_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: .LBB7_2: ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 @@ -2335,32 +2437,34 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX1164-NEXT: s_load_b32 s8, s[0:1], 0x34 +; GFX1164-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX1164-NEXT: s_mov_b64 s[8:9], exec ; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB7_2 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_and_b64 s[10:11], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1164-NEXT: s_mov_b32 s15, 0x31016000 +; GFX1164-NEXT: s_bcnt1_i32_b64 s1, s[8:9] +; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_mul_i32 s2, s8, s2 -; GFX1164-NEXT: s_mov_b32 s14, -1 -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-NEXT: s_mov_b32 s12, s6 -; GFX1164-NEXT: s_mov_b32 s13, s7 -; GFX1164-NEXT: buffer_atomic_sub_u32 v1, off, s[12:15], 0 glc +; GFX1164-NEXT: s_mul_i32 s1, s0, s1 +; GFX1164-NEXT: s_mov_b32 s10, -1 +; GFX1164-NEXT: v_mov_b32_e32 v1, s1 +; GFX1164-NEXT: s_mov_b32 s8, s6 +; GFX1164-NEXT: s_mov_b32 s9, s7 +; GFX1164-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl1_inv ; GFX1164-NEXT: buffer_gl0_inv +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: .LBB7_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mul_lo_u32 v0, s8, v0 +; GFX1164-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX1164-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s6, -1 @@ -2376,28 +2480,30 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX1132-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB7_2 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_and_b32 s1, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s3 ; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mul_i32 s2, s0, s2 +; GFX1132-NEXT: s_mul_i32 s1, s0, s1 ; GFX1132-NEXT: s_mov_b32 s10, -1 -; GFX1132-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-NEXT: v_mov_b32_e32 v1, s1 ; GFX1132-NEXT: s_mov_b32 s8, s6 ; GFX1132-NEXT: s_mov_b32 s9, s7 ; GFX1132-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: buffer_gl1_inv ; GFX1132-NEXT: buffer_gl0_inv +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: .LBB7_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX1132-NEXT: v_readfirstlane_b32 s0, v1 @@ -2414,31 +2520,33 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264: ; %bb.0: ; %entry ; GFX1264-NEXT: s_clause 0x1 ; GFX1264-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX1264-NEXT: s_load_b32 s8, s[0:1], 0x34 +; GFX1264-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX1264-NEXT: s_mov_b64 s[8:9], exec ; GFX1264-NEXT: s_mov_b64 s[2:3], exec -; GFX1264-NEXT: s_mov_b64 s[0:1], exec -; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1264-NEXT: ; implicit-def: $vgpr1 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1264-NEXT: s_cbranch_execz .LBB7_2 +; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1264-NEXT: s_and_b64 s[10:11], vcc, -1 +; GFX1264-NEXT: s_cmov_b64 exec, vcc +; GFX1264-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX1264-NEXT: ; %bb.1: -; GFX1264-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1264-NEXT: s_mov_b32 s15, 0x31016000 +; GFX1264-NEXT: s_bcnt1_i32_b64 s1, s[8:9] +; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: s_mul_i32 s2, s8, s2 -; GFX1264-NEXT: s_mov_b32 s14, -1 -; GFX1264-NEXT: v_mov_b32_e32 v1, s2 -; GFX1264-NEXT: s_mov_b32 s12, s6 -; GFX1264-NEXT: s_mov_b32 s13, s7 -; GFX1264-NEXT: buffer_atomic_sub_u32 v1, off, s[12:15], null th:TH_ATOMIC_RETURN +; GFX1264-NEXT: s_mul_i32 s1, s0, s1 +; GFX1264-NEXT: s_mov_b32 s10, -1 +; GFX1264-NEXT: v_mov_b32_e32 v1, s1 +; GFX1264-NEXT: s_mov_b32 s8, s6 +; GFX1264-NEXT: s_mov_b32 s9, s7 +; GFX1264-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV +; GFX1264-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1264-NEXT: .LBB7_2: -; GFX1264-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: v_mul_lo_u32 v0, s8, v0 +; GFX1264-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX1264-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1264-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1264-NEXT: s_mov_b32 s6, -1 @@ -2454,27 +2562,29 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: s_clause 0x1 ; GFX1232-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX1232-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX1232-NEXT: s_mov_b32 s3, exec_lo ; GFX1232-NEXT: s_mov_b32 s2, exec_lo -; GFX1232-NEXT: s_mov_b32 s1, exec_lo -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1232-NEXT: ; implicit-def: $vgpr1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1232-NEXT: s_cbranch_execz .LBB7_2 +; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1232-NEXT: s_and_b32 s1, vcc_lo, -1 +; GFX1232-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1232-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX1232-NEXT: ; %bb.1: -; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1232-NEXT: s_bcnt1_i32_b32 s1, s3 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1232-NEXT: s_wait_kmcnt 0x0 -; GFX1232-NEXT: s_mul_i32 s2, s0, s2 +; GFX1232-NEXT: s_mul_i32 s1, s0, s1 ; GFX1232-NEXT: s_mov_b32 s10, -1 -; GFX1232-NEXT: v_mov_b32_e32 v1, s2 +; GFX1232-NEXT: v_mov_b32_e32 v1, s1 ; GFX1232-NEXT: s_mov_b32 s8, s6 ; GFX1232-NEXT: s_mov_b32 s9, s7 ; GFX1232-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV +; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1232-NEXT: .LBB7_2: -; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX1232-NEXT: v_readfirstlane_b32 s0, v1 @@ -2532,10 +2642,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX8-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX8-NEXT: s_cbranch_execz .LBB8_4 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB8_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: s_mov_b32 s11, 0xf000 ; GFX8-NEXT: s_mov_b32 s10, -1 @@ -2546,8 +2657,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: .LBB8_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: .LBB8_4: ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s3, 0xf000 @@ -2577,10 +2688,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB8_4 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB8_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 @@ -2591,8 +2703,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: .LBB8_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB8_4: ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s3, 0xf000 @@ -2622,9 +2734,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX1064-NEXT: s_cbranch_execz .LBB8_4 +; GFX1064-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX1064-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB8_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, s6 ; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 @@ -2636,9 +2749,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB8_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: .LBB8_4: ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 @@ -2667,9 +2780,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1032-NEXT: s_xor_b32 s5, exec_lo, s5 -; GFX1032-NEXT: s_cbranch_execz .LBB8_4 +; GFX1032-NEXT: s_xor_b32 s5, vcc_lo, exec_lo +; GFX1032-NEXT: s_and_b32 s6, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB8_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, s4 ; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 @@ -2681,9 +2795,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB8_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1032-NEXT: .LBB8_4: ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 @@ -2715,10 +2829,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX1164-NEXT: s_cbranch_execz .LBB8_4 +; GFX1164-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX1164-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB8_4 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: v_mov_b32_e32 v0, s6 ; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 @@ -2730,8 +2844,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl1_inv ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB8_4: ; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB8_4: ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 @@ -2762,12 +2876,13 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1132-NEXT: s_xor_b32 s5, exec_lo, s5 -; GFX1132-NEXT: s_cbranch_execz .LBB8_4 +; GFX1132-NEXT: s_xor_b32 s5, vcc_lo, exec_lo +; GFX1132-NEXT: s_and_b32 s6, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB8_4 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: v_mov_b32_e32 v0, s4 ; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 @@ -2779,8 +2894,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: buffer_gl1_inv ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB8_4: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1132-NEXT: .LBB8_4: ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 @@ -2815,10 +2930,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1264-NEXT: ; implicit-def: $vgpr0 -; GFX1264-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1264-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1264-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX1264-NEXT: s_cbranch_execz .LBB8_4 +; GFX1264-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX1264-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX1264-NEXT: s_cmov_b64 exec, vcc +; GFX1264-NEXT: s_cbranch_scc0 .LBB8_4 ; GFX1264-NEXT: ; %bb.3: ; GFX1264-NEXT: v_mov_b32_e32 v0, s6 ; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 @@ -2829,8 +2944,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV -; GFX1264-NEXT: .LBB8_4: ; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1264-NEXT: .LBB8_4: ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 @@ -2861,12 +2976,13 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1232-NEXT: ; implicit-def: $vgpr0 -; GFX1232-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1232-NEXT: s_xor_b32 s5, exec_lo, s5 -; GFX1232-NEXT: s_cbranch_execz .LBB8_4 +; GFX1232-NEXT: s_xor_b32 s5, vcc_lo, exec_lo +; GFX1232-NEXT: s_and_b32 s6, vcc_lo, -1 +; GFX1232-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1232-NEXT: s_cbranch_scc0 .LBB8_4 ; GFX1232-NEXT: ; %bb.3: ; GFX1232-NEXT: v_mov_b32_e32 v0, s4 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 @@ -2877,8 +2993,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV -; GFX1232-NEXT: .LBB8_4: ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1232-NEXT: .LBB8_4: ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 @@ -2900,13 +3016,15 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX7LESS-LABEL: sub_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec +; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec ; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX7LESS-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB9_2 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] @@ -2920,8 +3038,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX7LESS-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: buffer_wbinvl1 -; GFX7LESS-NEXT: .LBB9_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: .LBB9_2: ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 @@ -2938,14 +3056,16 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: sub_i64_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB9_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s8, s2 @@ -2959,8 +3079,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: .LBB9_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: .LBB9_2: ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2 @@ -2976,14 +3096,16 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: sub_i64_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[6:7], exec +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB9_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s8, s2 @@ -2997,8 +3119,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: .LBB9_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB9_2: ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2 @@ -3016,12 +3138,14 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: s_mov_b64 s[6:7], exec +; GFX1064-NEXT: s_mov_b64 s[4:5], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB9_2 +; GFX1064-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 @@ -3036,9 +3160,9 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB9_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: .LBB9_2: ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v2 @@ -3055,11 +3179,13 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: s_mov_b32 s5, exec_lo -; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1032-NEXT: s_mov_b32 s4, exec_lo ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB9_2 +; GFX1032-NEXT: s_and_b32 s6, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 @@ -3074,9 +3200,9 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB9_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: .LBB9_2: ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v2 @@ -3098,8 +3224,10 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-NEXT: s_cbranch_execz .LBB9_2 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 @@ -3114,8 +3242,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl1_inv ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB9_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB9_2: ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v2 @@ -3139,8 +3267,10 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-NEXT: s_cbranch_execz .LBB9_2 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-NEXT: s_and_b32 s6, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 ; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 @@ -3154,8 +3284,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: buffer_gl1_inv ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB9_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: .LBB9_2: ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v2 @@ -3175,14 +3305,16 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264: ; %bb.0: ; %entry ; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1264-NEXT: s_mov_b64 s[6:7], exec -; GFX1264-NEXT: s_mov_b32 s9, 0 -; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1264-NEXT: s_mov_b64 s[4:5], exec +; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX1264-NEXT: s_mov_b32 s9, 0 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1264-NEXT: s_cbranch_execz .LBB9_2 +; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1264-NEXT: s_and_b64 s[10:11], vcc, -1 +; GFX1264-NEXT: s_cmov_b64 exec, vcc +; GFX1264-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX1264-NEXT: ; %bb.1: ; GFX1264-NEXT: s_bcnt1_i32_b64 s8, s[6:7] ; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 @@ -3196,8 +3328,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV -; GFX1264-NEXT: .LBB9_2: ; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1264-NEXT: .LBB9_2: ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1264-NEXT: v_mul_u32_u24_e32 v0, 5, v2 @@ -3217,13 +3349,15 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232: ; %bb.0: ; %entry ; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1232-NEXT: s_mov_b32 s4, exec_lo -; GFX1232-NEXT: s_mov_b32 s5, 0 -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s4, 0 ; GFX1232-NEXT: s_mov_b32 s6, exec_lo +; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s4, 0 +; GFX1232-NEXT: s_mov_b32 s5, 0 ; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1232-NEXT: s_cbranch_execz .LBB9_2 +; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1232-NEXT: s_and_b32 s7, vcc_lo, -1 +; GFX1232-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1232-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX1232-NEXT: ; %bb.1: ; GFX1232-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 @@ -3236,8 +3370,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV -; GFX1232-NEXT: .LBB9_2: ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX1232-NEXT: .LBB9_2: ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1232-NEXT: v_mul_u32_u24_e32 v0, 5, v2 @@ -3262,14 +3396,16 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX7LESS-LABEL: sub_i64_uniform: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[8:9], exec +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s9, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX7LESS-NEXT: s_and_b64 s[10:11], vcc, -1 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB10_2 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB10_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s14, -1 @@ -3286,8 +3422,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX7LESS-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: buffer_wbinvl1 -; GFX7LESS-NEXT: .LBB10_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: .LBB10_2: ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 @@ -3306,15 +3442,17 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: sub_i64_uniform: ; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_mov_b64 s[8:9], exec ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b64 s[8:9], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_and_b64 s[10:11], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB10_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB10_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s12, s6 @@ -3329,8 +3467,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX8-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: .LBB10_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: .LBB10_2: ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v4, s1, v2 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s0, v2, 0 @@ -3347,33 +3485,35 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: sub_i64_uniform: ; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_mov_b64 s[10:11], exec ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_mov_b64 s[8:9], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s10, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s11, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_mov_b64 s[8:9], exec +; GFX9-NEXT: s_and_b64 s[0:1], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB10_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB10_2 ; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[10:11] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[8:9] -; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mul_i32 s7, s3, s6 -; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 -; GFX9-NEXT: s_add_i32 s8, s8, s7 -; GFX9-NEXT: s_mul_i32 s6, s2, s6 +; GFX9-NEXT: s_mul_i32 s1, s3, s0 +; GFX9-NEXT: s_mul_hi_u32 s6, s2, s0 +; GFX9-NEXT: s_add_i32 s6, s6, s1 +; GFX9-NEXT: s_mul_i32 s0, s2, s0 ; GFX9-NEXT: s_mov_b32 s15, 0xf000 ; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: .LBB10_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v2, 0 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 @@ -3393,33 +3533,35 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1064-NEXT: s_mov_b64 s[10:11], exec ; GFX1064-NEXT: s_mov_b64 s[8:9], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s10, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s11, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB10_2 +; GFX1064-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB10_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s8, s[8:9] -; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[10:11] +; GFX1064-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mul_i32 s9, s3, s8 -; GFX1064-NEXT: s_mul_hi_u32 s10, s2, s8 -; GFX1064-NEXT: s_mul_i32 s8, s2, s8 -; GFX1064-NEXT: s_add_i32 s10, s10, s9 -; GFX1064-NEXT: v_mov_b32_e32 v0, s8 +; GFX1064-NEXT: s_mul_i32 s1, s3, s0 +; GFX1064-NEXT: s_mul_hi_u32 s10, s2, s0 +; GFX1064-NEXT: s_mul_i32 s0, s2, s0 +; GFX1064-NEXT: s_add_i32 s10, s10, s1 +; GFX1064-NEXT: v_mov_b32_e32 v0, s0 ; GFX1064-NEXT: v_mov_b32_e32 v1, s10 -; GFX1064-NEXT: s_mov_b32 s10, -1 -; GFX1064-NEXT: s_mov_b32 s8, s6 -; GFX1064-NEXT: s_mov_b32 s9, s7 -; GFX1064-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc +; GFX1064-NEXT: s_mov_b32 s14, -1 +; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB10_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1064-NEXT: .LBB10_2: ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v2, 0 ; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 @@ -3438,32 +3580,34 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1032-NEXT: s_clause 0x1 ; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1032-NEXT: s_mov_b32 s9, exec_lo ; GFX1032-NEXT: s_mov_b32 s8, exec_lo +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s9, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB10_2 +; GFX1032-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB10_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s8 -; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s9 +; GFX1032-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s8, s3, s1 -; GFX1032-NEXT: s_mul_hi_u32 s9, s2, s1 -; GFX1032-NEXT: s_mul_i32 s1, s2, s1 -; GFX1032-NEXT: s_add_i32 s9, s9, s8 -; GFX1032-NEXT: v_mov_b32_e32 v0, s1 +; GFX1032-NEXT: s_mul_i32 s1, s3, s0 +; GFX1032-NEXT: s_mul_hi_u32 s9, s2, s0 +; GFX1032-NEXT: s_mul_i32 s0, s2, s0 +; GFX1032-NEXT: s_add_i32 s9, s9, s1 +; GFX1032-NEXT: v_mov_b32_e32 v0, s0 ; GFX1032-NEXT: v_mov_b32_e32 v1, s9 -; GFX1032-NEXT: s_mov_b32 s10, -1 -; GFX1032-NEXT: s_mov_b32 s8, s6 -; GFX1032-NEXT: s_mov_b32 s9, s7 -; GFX1032-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc +; GFX1032-NEXT: s_mov_b32 s14, -1 +; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB10_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1032-NEXT: .LBB10_2: ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mad_u64_u32 v[3:4], s0, s2, v2, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s1, v1 @@ -3488,8 +3632,10 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-NEXT: s_cbranch_execz .LBB10_2 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-NEXT: s_and_b64 s[10:11], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB10_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s8, s[8:9] ; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 @@ -3507,8 +3653,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl1_inv ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB10_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: .LBB10_2: ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s0, v2, 0 ; GFX1164-NEXT: v_readfirstlane_b32 s0, v0 @@ -3536,8 +3682,10 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-NEXT: s_cbranch_execz .LBB10_2 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-NEXT: s_and_b32 s8, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB10_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 @@ -3555,8 +3703,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: buffer_gl1_inv ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB10_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: .LBB10_2: ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s0, v2, 0 ; GFX1132-NEXT: v_readfirstlane_b32 s0, v0 @@ -3580,14 +3728,16 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX1264-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX1264-NEXT: s_mov_b64 s[8:9], exec -; GFX1264-NEXT: s_mov_b32 s11, 0 -; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1264-NEXT: s_mov_b64 s[2:3], exec +; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1264-NEXT: s_mov_b32 s11, 0 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1264-NEXT: s_cbranch_execz .LBB10_2 +; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1264-NEXT: s_and_b64 s[12:13], vcc, -1 +; GFX1264-NEXT: s_cmov_b64 exec, vcc +; GFX1264-NEXT: s_cbranch_scc0 .LBB10_2 ; GFX1264-NEXT: ; %bb.1: ; GFX1264-NEXT: s_bcnt1_i32_b64 s10, s[8:9] ; GFX1264-NEXT: s_wait_kmcnt 0x0 @@ -3601,8 +3751,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV -; GFX1264-NEXT: .LBB10_2: ; GFX1264-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1264-NEXT: .LBB10_2: ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: v_mad_co_u64_u32 v[3:4], null, s0, v2, 0 ; GFX1264-NEXT: v_readfirstlane_b32 s0, v0 @@ -3626,13 +3776,15 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX1232-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX1232-NEXT: s_mov_b32 s2, exec_lo -; GFX1232-NEXT: s_mov_b32 s3, 0 -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0 ; GFX1232-NEXT: s_mov_b32 s8, exec_lo +; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0 +; GFX1232-NEXT: s_mov_b32 s3, 0 ; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1232-NEXT: s_cbranch_execz .LBB10_2 +; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1232-NEXT: s_and_b32 s9, vcc_lo, -1 +; GFX1232-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1232-NEXT: s_cbranch_scc0 .LBB10_2 ; GFX1232-NEXT: ; %bb.1: ; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX1232-NEXT: s_mov_b32 s15, 0x31016000 @@ -3645,8 +3797,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[12:15], null th:TH_ATOMIC_RETURN ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV -; GFX1232-NEXT: .LBB10_2: ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1232-NEXT: .LBB10_2: ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: v_mad_co_u64_u32 v[3:4], null, s0, v2, 0 ; GFX1232-NEXT: v_readfirstlane_b32 s0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index af6f69130910d..0c0fc75094b01 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -20,12 +20,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX7LESS-LABEL: add_i32_constant: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB0_2 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 @@ -34,8 +36,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: .LBB0_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: .LBB0_2: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 @@ -51,9 +53,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB0_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_mul_i32 s4, s4, 5 @@ -62,8 +66,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB0_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: .LBB0_2: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v1 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 @@ -79,9 +83,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB0_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_mul_i32 s4, s4, 5 @@ -89,8 +95,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB0_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB0_2: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 @@ -103,12 +109,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX1064-LABEL: add_i32_constant: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_mov_b64 s[4:5], exec -; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB0_2 +; GFX1064-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 @@ -117,9 +125,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB0_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: .LBB0_2: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 @@ -132,11 +140,13 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX1032-LABEL: add_i32_constant: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: s_mov_b32 s2, exec_lo ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB0_2 +; GFX1032-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 @@ -145,9 +155,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB0_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: .LBB0_2: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 @@ -165,8 +175,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB0_2 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 @@ -176,8 +188,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB0_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: .LBB0_2: ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 @@ -197,8 +209,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB0_2 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -207,8 +221,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB0_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: .LBB0_2: ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 @@ -232,13 +246,15 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX7LESS-LABEL: add_i32_uniform: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0xb ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB1_2 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -248,8 +264,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: .LBB1_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: .LBB1_2: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 @@ -262,14 +278,16 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; ; GFX8-LABEL: add_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB1_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -279,8 +297,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB1_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: .LBB1_2: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -293,14 +311,16 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; ; GFX9-LABEL: add_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -309,8 +329,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB1_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB1_2: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -325,12 +345,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX1064-NEXT: s_mov_b64 s[4:5], exec -; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB1_2 +; GFX1064-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 @@ -340,9 +362,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB1_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: .LBB1_2: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -356,11 +378,13 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX1032-NEXT: s_mov_b32 s4, exec_lo -; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB1_2 +; GFX1032-NEXT: s_and_b32 s5, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 @@ -370,9 +394,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB1_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: .LBB1_2: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s4, v1 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 @@ -391,8 +415,10 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB1_2 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 @@ -403,8 +429,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX1164-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB1_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: .LBB1_2: ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -425,8 +451,10 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB1_2 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_and_b32 s5, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) @@ -436,8 +464,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX1132-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB1_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-NEXT: .LBB1_2: ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s4, v1 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 @@ -491,18 +519,19 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX8-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX8-NEXT: s_cbranch_execz .LBB2_4 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB2_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: .LBB2_4: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 @@ -532,17 +561,18 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX9-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX9-NEXT: s_cbranch_execz .LBB2_4 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB2_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB2_4: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 @@ -572,18 +602,19 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execz .LBB2_4 +; GFX1064-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX1064-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB2_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: .LBB2_4: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 @@ -612,18 +643,19 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1032-NEXT: s_cbranch_execz .LBB2_4 +; GFX1032-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX1032-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB2_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: .LBB2_4: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 @@ -655,18 +687,18 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execz .LBB2_4 +; GFX1164-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX1164-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB2_4: ; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: .LBB2_4: ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 @@ -697,20 +729,21 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1132-NEXT: s_cbranch_execz .LBB2_4 +; GFX1132-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX1132-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132-NEXT: v_mov_b32_e32 v2, s2 ; GFX1132-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB2_4: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-NEXT: .LBB2_4: ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 @@ -755,9 +788,10 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8-NEXT: s_cbranch_execz .LBB3_4 +; GFX8-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX8-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, s2 @@ -784,9 +818,10 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB3_4 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -812,9 +847,10 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB3_4 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_mov_b32_e32 v1, s2 @@ -840,9 +876,10 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032-NEXT: s_cbranch_execz .LBB3_4 +; GFX1032-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX1032-NEXT: s_and_b32 s1, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_mov_b32_e32 v1, s0 @@ -869,12 +906,13 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB3_4 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_mov_b32_e32 v1, s2 @@ -901,11 +939,12 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s1, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132-NEXT: s_cbranch_execz .LBB3_4 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX1132-NEXT: s_and_b32 s1, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: ds_add_u32 v0, v1 @@ -925,12 +964,14 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-LABEL: add_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX7LESS-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB4_2 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 @@ -939,8 +980,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: .LBB4_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: .LBB4_2: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 @@ -961,9 +1002,11 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB4_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_mul_i32 s4, s4, 5 @@ -972,8 +1015,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB4_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: .LBB4_2: ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_readfirstlane_b32 s3, v1 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 @@ -993,9 +1036,11 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB4_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_mul_i32 s4, s4, 5 @@ -1003,8 +1048,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB4_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB4_2: ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_readfirstlane_b32 s3, v1 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 @@ -1021,12 +1066,14 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX1064-LABEL: add_i64_constant: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_mov_b64 s[4:5], exec +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB4_2 +; GFX1064-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 @@ -1035,9 +1082,9 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB4_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: .LBB4_2: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 @@ -1051,11 +1098,13 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX1032-LABEL: add_i64_constant: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1032-NEXT: s_mov_b32 s2, exec_lo ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 @@ -1064,9 +1113,9 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB4_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: .LBB4_2: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 @@ -1085,8 +1134,10 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 @@ -1096,8 +1147,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB4_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: .LBB4_2: ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 @@ -1118,8 +1169,10 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1132-NEXT: v_mov_b32_e32 v1, 0 @@ -1129,8 +1182,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB4_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: .LBB4_2: ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 @@ -1155,13 +1208,15 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX7LESS-LABEL: add_i64_uniform: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec +; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec ; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX7LESS-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB5_2 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 @@ -1175,8 +1230,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: .LBB5_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: .LBB5_2: ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -1196,14 +1251,16 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; ; GFX8-LABEL: add_i64_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB5_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s8, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, s8 @@ -1215,8 +1272,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB5_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: .LBB5_2: ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 @@ -1234,14 +1291,16 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; ; GFX9-LABEL: add_i64_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[6:7], exec +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1254,8 +1313,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB5_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB5_2: ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -1274,12 +1333,14 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: s_mov_b64 s[6:7], exec +; GFX1064-NEXT: s_mov_b64 s[4:5], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB5_2 +; GFX1064-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 @@ -1293,9 +1354,9 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB5_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: .LBB5_2: ; GFX1064-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s5, v1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -1310,11 +1371,13 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: s_mov_b32 s5, exec_lo -; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1032-NEXT: s_mov_b32 s4, exec_lo ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB5_2 +; GFX1032-NEXT: s_and_b32 s6, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 @@ -1328,9 +1391,9 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB5_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: .LBB5_2: ; GFX1032-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s5, v1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -1350,8 +1413,10 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-NEXT: s_cbranch_execz .LBB5_2 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 @@ -1365,8 +1430,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX1164-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB5_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB5_2: ; GFX1164-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1164-NEXT: v_readfirstlane_b32 s5, v1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -1390,8 +1455,10 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-NEXT: s_cbranch_execz .LBB5_2 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-NEXT: s_and_b32 s6, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 @@ -1405,8 +1472,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX1132-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB5_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: .LBB5_2: ; GFX1132-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1132-NEXT: v_readfirstlane_b32 s5, v1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) @@ -1509,12 +1576,14 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX7LESS-LABEL: sub_i32_constant: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB7_2 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 @@ -1523,8 +1592,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: .LBB7_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: .LBB7_2: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 @@ -1541,9 +1610,11 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB7_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_mul_i32 s4, s4, 5 @@ -1552,8 +1623,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB7_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: .LBB7_2: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v1 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1570,9 +1641,11 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB7_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_mul_i32 s4, s4, 5 @@ -1580,8 +1653,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB7_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB7_2: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1595,12 +1668,14 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1064-LABEL: sub_i32_constant: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_mov_b64 s[4:5], exec -; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB7_2 +; GFX1064-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 @@ -1609,9 +1684,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB7_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: .LBB7_2: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1625,11 +1700,13 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1032-LABEL: sub_i32_constant: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: s_mov_b32 s2, exec_lo ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB7_2 +; GFX1032-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 @@ -1638,9 +1715,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB7_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: .LBB7_2: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1659,8 +1736,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB7_2 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 @@ -1670,8 +1749,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB7_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: .LBB7_2: ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1692,8 +1771,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB7_2 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -1702,8 +1783,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB7_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: .LBB7_2: ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1728,13 +1809,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX7LESS-LABEL: sub_i32_uniform: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0xb ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB8_2 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -1744,8 +1827,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: .LBB8_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: .LBB8_2: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 @@ -1758,14 +1841,16 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; ; GFX8-LABEL: sub_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB8_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1775,8 +1860,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB8_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: .LBB8_2: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -1789,14 +1874,16 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; ; GFX9-LABEL: sub_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB8_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1805,8 +1892,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB8_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB8_2: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -1821,12 +1908,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX1064-NEXT: s_mov_b64 s[4:5], exec -; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB8_2 +; GFX1064-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 @@ -1836,9 +1925,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB8_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: .LBB8_2: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -1853,11 +1942,13 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX1032-NEXT: s_mov_b32 s4, exec_lo -; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB8_2 +; GFX1032-NEXT: s_and_b32 s5, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 @@ -1867,9 +1958,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB8_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: .LBB8_2: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 @@ -1889,8 +1980,10 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB8_2 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 @@ -1901,8 +1994,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1164-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB8_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: .LBB8_2: ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -1924,8 +2017,10 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB8_2 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_and_b32 s5, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) @@ -1935,8 +2030,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1132-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB8_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-NEXT: .LBB8_2: ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_mul_lo_u32 v0, s2, v0 @@ -1991,18 +2086,19 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX8-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX8-NEXT: s_cbranch_execz .LBB9_4 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB9_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB9_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: .LBB9_4: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 @@ -2032,17 +2128,18 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX9-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX9-NEXT: s_cbranch_execz .LBB9_4 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB9_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB9_4: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 @@ -2072,18 +2169,19 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execz .LBB9_4 +; GFX1064-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX1064-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB9_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB9_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: .LBB9_4: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 @@ -2112,18 +2210,19 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1032-NEXT: s_cbranch_execz .LBB9_4 +; GFX1032-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX1032-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB9_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB9_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: .LBB9_4: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 @@ -2155,18 +2254,18 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execz .LBB9_4 +; GFX1164-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX1164-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB9_4 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB9_4: ; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: .LBB9_4: ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 @@ -2197,20 +2296,21 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1132-NEXT: s_cbranch_execz .LBB9_4 +; GFX1132-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX1132-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB9_4 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132-NEXT: v_mov_b32_e32 v2, s2 ; GFX1132-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB9_4: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-NEXT: .LBB9_4: ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 @@ -2255,9 +2355,10 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8-NEXT: s_cbranch_execz .LBB10_4 +; GFX8-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX8-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB10_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, s2 @@ -2284,9 +2385,10 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB10_4 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB10_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -2312,9 +2414,10 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB10_4 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB10_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_mov_b32_e32 v1, s2 @@ -2340,9 +2443,10 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032-NEXT: s_cbranch_execz .LBB10_4 +; GFX1032-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX1032-NEXT: s_and_b32 s1, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB10_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_mov_b32_e32 v1, s0 @@ -2369,12 +2473,13 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB10_4 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB10_4 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_mov_b32_e32 v1, s2 @@ -2401,11 +2506,12 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s1, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132-NEXT: s_cbranch_execz .LBB10_4 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX1132-NEXT: s_and_b32 s1, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB10_4 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: ds_sub_u32 v0, v1 @@ -2425,12 +2531,14 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-LABEL: sub_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX7LESS-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB11_2 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB11_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 @@ -2439,8 +2547,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: .LBB11_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: .LBB11_2: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 @@ -2461,9 +2569,11 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB11_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB11_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_mul_i32 s4, s4, 5 @@ -2472,8 +2582,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB11_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: .LBB11_2: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 @@ -2494,9 +2604,11 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB11_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB11_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_mul_i32 s4, s4, 5 @@ -2504,8 +2616,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB11_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB11_2: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 @@ -2523,12 +2635,14 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX1064-LABEL: sub_i64_constant: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_mov_b64 s[4:5], exec +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB11_2 +; GFX1064-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB11_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 @@ -2537,9 +2651,9 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB11_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: .LBB11_2: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v2 @@ -2556,11 +2670,13 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX1032-LABEL: sub_i64_constant: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1032-NEXT: s_mov_b32 s2, exec_lo ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB11_2 +; GFX1032-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB11_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 @@ -2569,9 +2685,9 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB11_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: .LBB11_2: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v2 @@ -2593,8 +2709,10 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-NEXT: s_cbranch_execz .LBB11_2 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB11_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 @@ -2604,8 +2722,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB11_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: .LBB11_2: ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v2 @@ -2629,8 +2747,10 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-NEXT: s_cbranch_execz .LBB11_2 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB11_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1132-NEXT: v_mov_b32_e32 v1, 0 @@ -2640,8 +2760,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB11_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: .LBB11_2: ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v2 @@ -2669,13 +2789,15 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX7LESS-LABEL: sub_i64_uniform: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec +; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec ; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX7LESS-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB12_2 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB12_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 @@ -2689,8 +2811,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: .LBB12_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: .LBB12_2: ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -2710,14 +2832,16 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; ; GFX8-LABEL: sub_i64_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB12_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB12_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s8, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, s8 @@ -2729,8 +2853,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB12_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: .LBB12_2: ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s4, s0 ; GFX8-NEXT: s_mov_b32 s5, s1 @@ -2749,14 +2873,16 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; ; GFX9-LABEL: sub_i64_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[6:7], exec +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB12_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB12_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2769,8 +2895,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB12_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB12_2: ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0 ; GFX9-NEXT: s_mov_b32 s4, s0 @@ -2791,12 +2917,14 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: s_mov_b64 s[6:7], exec +; GFX1064-NEXT: s_mov_b64 s[4:5], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB12_2 +; GFX1064-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB12_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 @@ -2810,9 +2938,9 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB12_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: .LBB12_2: ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s4, v1 @@ -2830,11 +2958,13 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: s_mov_b32 s5, exec_lo -; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1032-NEXT: s_mov_b32 s4, exec_lo ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB12_2 +; GFX1032-NEXT: s_and_b32 s6, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB12_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 @@ -2848,9 +2978,9 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB12_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: .LBB12_2: ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mad_u64_u32 v[3:4], s2, s2, v2, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s4, v1 @@ -2873,8 +3003,10 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-NEXT: s_cbranch_execz .LBB12_2 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB12_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 @@ -2888,8 +3020,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX1164-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB12_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB12_2: ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 @@ -2915,8 +3047,10 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-NEXT: s_cbranch_execz .LBB12_2 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-NEXT: s_and_b32 s6, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB12_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 @@ -2930,8 +3064,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB12_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: .LBB12_2: ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 @@ -3066,18 +3200,19 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX8-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX8-NEXT: s_cbranch_execz .LBB14_4 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB14_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB14_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: .LBB14_4: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 @@ -3107,17 +3242,18 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX9-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX9-NEXT: s_cbranch_execz .LBB14_4 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB14_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB14_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB14_4: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 @@ -3147,18 +3283,19 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execz .LBB14_4 +; GFX1064-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX1064-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB14_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB14_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: .LBB14_4: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 @@ -3187,18 +3324,19 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1032-NEXT: s_cbranch_execz .LBB14_4 +; GFX1032-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX1032-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB14_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB14_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: .LBB14_4: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 @@ -3230,18 +3368,18 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execz .LBB14_4 +; GFX1164-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX1164-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB14_4 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB14_4: ; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: .LBB14_4: ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 @@ -3272,20 +3410,21 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1132-NEXT: s_cbranch_execz .LBB14_4 +; GFX1132-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX1132-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB14_4 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132-NEXT: v_mov_b32_e32 v2, s2 ; GFX1132-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB14_4: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-NEXT: .LBB14_4: ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 @@ -3340,18 +3479,19 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX8-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX8-NEXT: s_cbranch_execz .LBB15_4 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB15_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB15_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: .LBB15_4: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 @@ -3381,17 +3521,18 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX9-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX9-NEXT: s_cbranch_execz .LBB15_4 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB15_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB15_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB15_4: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 @@ -3421,18 +3562,19 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execz .LBB15_4 +; GFX1064-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX1064-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB15_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB15_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: .LBB15_4: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 @@ -3461,18 +3603,19 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1032-NEXT: s_cbranch_execz .LBB15_4 +; GFX1032-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX1032-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB15_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB15_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: .LBB15_4: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 @@ -3504,18 +3647,18 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execz .LBB15_4 +; GFX1164-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX1164-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB15_4 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB15_4: ; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: .LBB15_4: ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 @@ -3546,20 +3689,21 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1132-NEXT: s_cbranch_execz .LBB15_4 +; GFX1132-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX1132-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB15_4 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132-NEXT: v_mov_b32_e32 v2, s2 ; GFX1132-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB15_4: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-NEXT: .LBB15_4: ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 @@ -3614,18 +3758,19 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX8-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX8-NEXT: s_cbranch_execz .LBB16_4 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB16_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: .LBB16_4: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 @@ -3655,17 +3800,18 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX9-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX9-NEXT: s_cbranch_execz .LBB16_4 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB16_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB16_4: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 @@ -3695,18 +3841,19 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execz .LBB16_4 +; GFX1064-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX1064-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB16_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: .LBB16_4: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 @@ -3735,18 +3882,19 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1032-NEXT: s_cbranch_execz .LBB16_4 +; GFX1032-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX1032-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB16_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: .LBB16_4: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 @@ -3778,18 +3926,18 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execz .LBB16_4 +; GFX1164-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX1164-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB16_4: ; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: .LBB16_4: ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 @@ -3820,20 +3968,21 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1132-NEXT: s_cbranch_execz .LBB16_4 +; GFX1132-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX1132-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132-NEXT: v_mov_b32_e32 v2, s2 ; GFX1132-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB16_4: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-NEXT: .LBB16_4: ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 @@ -3888,18 +4037,19 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX8-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX8-NEXT: s_cbranch_execz .LBB17_4 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB17_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB17_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: .LBB17_4: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 @@ -3929,17 +4079,18 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX9-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX9-NEXT: s_cbranch_execz .LBB17_4 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB17_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB17_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB17_4: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 @@ -3969,18 +4120,19 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execz .LBB17_4 +; GFX1064-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX1064-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB17_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB17_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: .LBB17_4: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 @@ -4009,18 +4161,19 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1032-NEXT: s_cbranch_execz .LBB17_4 +; GFX1032-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX1032-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB17_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB17_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: .LBB17_4: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 @@ -4052,18 +4205,18 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execz .LBB17_4 +; GFX1164-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX1164-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB17_4 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB17_4: ; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: .LBB17_4: ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 @@ -4094,20 +4247,21 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1132-NEXT: s_cbranch_execz .LBB17_4 +; GFX1132-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX1132-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB17_4 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132-NEXT: v_mov_b32_e32 v2, s2 ; GFX1132-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB17_4: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-NEXT: .LBB17_4: ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 @@ -4131,12 +4285,14 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; ; GFX7LESS-LABEL: max_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_b64 s[4:5], vcc, -1 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB18_2 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB18_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 @@ -4144,8 +4300,8 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: .LBB18_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: .LBB18_2: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 @@ -4168,9 +4324,11 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_and_b64 s[4:5], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB18_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB18_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 @@ -4178,8 +4336,8 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB18_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: .LBB18_2: ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_bfrev_b32_e32 v0, 1 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 @@ -4202,17 +4360,19 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_and_b64 s[4:5], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB18_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB18_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB18_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB18_2: ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_bfrev_b32_e32 v0, 1 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 @@ -4233,11 +4393,13 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1064-LABEL: max_i64_constant: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB18_2 +; GFX1064-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB18_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, 5 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 @@ -4245,9 +4407,9 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB18_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: .LBB18_2: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 @@ -4265,10 +4427,12 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1032-LABEL: max_i64_constant: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, exec_lo ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB18_2 +; GFX1032-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB18_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, 5 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 @@ -4276,9 +4440,9 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB18_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: .LBB18_2: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 @@ -4296,12 +4460,14 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1164-LABEL: max_i64_constant: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB18_2 +; GFX1164-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB18_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: v_mov_b32_e32 v0, 5 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 @@ -4309,8 +4475,8 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB18_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: .LBB18_2: ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 @@ -4331,19 +4497,21 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1132-LABEL: max_i64_constant: ; GFX1132: ; %bb.0: ; %entry ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB18_2 +; GFX1132-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB18_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v0, 5 ; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX1132-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB18_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: .LBB18_2: ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 @@ -4402,18 +4570,19 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX8-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX8-NEXT: s_cbranch_execz .LBB19_4 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB19_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB19_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: .LBB19_4: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 @@ -4443,17 +4612,18 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX9-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX9-NEXT: s_cbranch_execz .LBB19_4 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB19_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB19_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB19_4: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 @@ -4483,18 +4653,19 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execz .LBB19_4 +; GFX1064-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX1064-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB19_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB19_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: .LBB19_4: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 @@ -4523,18 +4694,19 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1032-NEXT: s_cbranch_execz .LBB19_4 +; GFX1032-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX1032-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB19_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB19_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: .LBB19_4: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 @@ -4566,18 +4738,18 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execz .LBB19_4 +; GFX1164-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX1164-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB19_4 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB19_4: ; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: .LBB19_4: ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 @@ -4608,20 +4780,21 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1132-NEXT: s_cbranch_execz .LBB19_4 +; GFX1132-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX1132-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB19_4 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132-NEXT: v_mov_b32_e32 v2, s2 ; GFX1132-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB19_4: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-NEXT: .LBB19_4: ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 @@ -4645,12 +4818,14 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; ; GFX7LESS-LABEL: min_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_b64 s[4:5], vcc, -1 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB20_2 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB20_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 @@ -4658,8 +4833,8 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: .LBB20_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: .LBB20_2: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 @@ -4682,9 +4857,11 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_and_b64 s[4:5], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB20_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB20_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 @@ -4692,8 +4869,8 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB20_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: .LBB20_2: ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_bfrev_b32_e32 v0, -2 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 @@ -4716,17 +4893,19 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_and_b64 s[4:5], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB20_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB20_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB20_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB20_2: ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_bfrev_b32_e32 v0, -2 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 @@ -4747,11 +4926,13 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1064-LABEL: min_i64_constant: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB20_2 +; GFX1064-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB20_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, 5 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 @@ -4759,9 +4940,9 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB20_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: .LBB20_2: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 @@ -4779,10 +4960,12 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1032-LABEL: min_i64_constant: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, exec_lo ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB20_2 +; GFX1032-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB20_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, 5 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 @@ -4790,9 +4973,9 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB20_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: .LBB20_2: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 @@ -4810,12 +4993,14 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1164-LABEL: min_i64_constant: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB20_2 +; GFX1164-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB20_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: v_mov_b32_e32 v0, 5 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 @@ -4823,8 +5008,8 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB20_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: .LBB20_2: ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 @@ -4845,19 +5030,21 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1132-LABEL: min_i64_constant: ; GFX1132: ; %bb.0: ; %entry ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB20_2 +; GFX1132-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB20_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v0, 5 ; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX1132-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB20_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: .LBB20_2: ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 @@ -4916,18 +5103,19 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX8-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX8-NEXT: s_cbranch_execz .LBB21_4 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB21_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB21_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: .LBB21_4: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 @@ -4957,17 +5145,18 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX9-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX9-NEXT: s_cbranch_execz .LBB21_4 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB21_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB21_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB21_4: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 @@ -4997,18 +5186,19 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execz .LBB21_4 +; GFX1064-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX1064-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB21_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB21_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: .LBB21_4: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 @@ -5037,18 +5227,19 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1032-NEXT: s_cbranch_execz .LBB21_4 +; GFX1032-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX1032-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB21_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB21_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: .LBB21_4: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 @@ -5080,18 +5271,18 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execz .LBB21_4 +; GFX1164-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX1164-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB21_4 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB21_4: ; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: .LBB21_4: ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 @@ -5122,20 +5313,21 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1132-NEXT: s_cbranch_execz .LBB21_4 +; GFX1132-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX1132-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB21_4 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132-NEXT: v_mov_b32_e32 v2, s2 ; GFX1132-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB21_4: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-NEXT: .LBB21_4: ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 @@ -5159,12 +5351,14 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; ; GFX7LESS-LABEL: umax_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_b64 s[4:5], vcc, -1 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB22_2 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB22_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 @@ -5172,8 +5366,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: .LBB22_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: .LBB22_2: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 @@ -5195,9 +5389,11 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_and_b64 s[4:5], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB22_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB22_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 @@ -5205,8 +5401,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB22_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: .LBB22_2: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 @@ -5228,17 +5424,19 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_and_b64 s[4:5], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB22_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB22_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB22_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB22_2: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 @@ -5258,11 +5456,13 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1064-LABEL: umax_i64_constant: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB22_2 +; GFX1064-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB22_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, 5 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 @@ -5270,9 +5470,9 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB22_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: .LBB22_2: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 @@ -5290,10 +5490,12 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1032-LABEL: umax_i64_constant: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, exec_lo ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB22_2 +; GFX1032-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB22_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, 5 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 @@ -5301,9 +5503,9 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB22_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: .LBB22_2: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 @@ -5321,12 +5523,14 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1164-LABEL: umax_i64_constant: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB22_2 +; GFX1164-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB22_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: v_mov_b32_e32 v0, 5 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 @@ -5334,8 +5538,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB22_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: .LBB22_2: ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 @@ -5356,19 +5560,21 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1132-LABEL: umax_i64_constant: ; GFX1132: ; %bb.0: ; %entry ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB22_2 +; GFX1132-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB22_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v0, 5 ; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX1132-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB22_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: .LBB22_2: ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 @@ -5427,18 +5633,19 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX8-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX8-NEXT: s_cbranch_execz .LBB23_4 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB23_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB23_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: .LBB23_4: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 @@ -5468,17 +5675,18 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX9-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX9-NEXT: s_cbranch_execz .LBB23_4 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB23_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB23_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB23_4: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 @@ -5508,18 +5716,19 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execz .LBB23_4 +; GFX1064-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX1064-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB23_4 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB23_4: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: .LBB23_4: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 @@ -5548,18 +5757,19 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1032-NEXT: s_cbranch_execz .LBB23_4 +; GFX1032-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX1032-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB23_4 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB23_4: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: .LBB23_4: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 @@ -5591,18 +5801,18 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execz .LBB23_4 +; GFX1164-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX1164-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB23_4 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB23_4: ; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: .LBB23_4: ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 @@ -5633,20 +5843,21 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1132-NEXT: s_cbranch_execz .LBB23_4 +; GFX1132-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX1132-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB23_4 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132-NEXT: v_mov_b32_e32 v2, s2 ; GFX1132-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB23_4: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-NEXT: .LBB23_4: ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 @@ -5670,12 +5881,14 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; ; GFX7LESS-LABEL: umin_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_b64 s[4:5], vcc, -1 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB24_2 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB24_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 @@ -5683,8 +5896,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: .LBB24_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: .LBB24_2: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 @@ -5706,9 +5919,11 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_and_b64 s[4:5], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB24_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB24_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 @@ -5716,8 +5931,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB24_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: .LBB24_2: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 @@ -5739,17 +5954,19 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_and_b64 s[4:5], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB24_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB24_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB24_2: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 @@ -5769,11 +5986,13 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1064-LABEL: umin_i64_constant: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB24_2 +; GFX1064-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB24_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, 5 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 @@ -5781,9 +6000,9 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB24_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: .LBB24_2: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 @@ -5801,10 +6020,12 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1032-LABEL: umin_i64_constant: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, exec_lo ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB24_2 +; GFX1032-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB24_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, 5 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 @@ -5812,9 +6033,9 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB24_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: .LBB24_2: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 @@ -5832,12 +6053,14 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1164-LABEL: umin_i64_constant: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB24_2 +; GFX1164-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB24_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: v_mov_b32_e32 v0, 5 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 @@ -5845,8 +6068,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB24_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: .LBB24_2: ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 @@ -5867,19 +6090,21 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1132-LABEL: umin_i64_constant: ; GFX1132: ; %bb.0: ; %entry ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB24_2 +; GFX1132-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB24_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v0, 5 ; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX1132-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB24_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: .LBB24_2: ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll index 29704959fc176..dbbd2363a2412 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll @@ -16,30 +16,34 @@ declare void @llvm.amdgcn.raw.ptr.buffer.store.f32(float, ptr addrspace(8), i32, define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspace(8) inreg %inout) { ; GFX7-LABEL: add_i32_constant: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[10:11], exec +; GFX7-NEXT: s_mov_b64 s[8:9], exec +; GFX7-NEXT: s_and_b64 s[10:11], exec, exec +; GFX7-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; GFX7-NEXT: ; implicit-def: $vgpr0 -; GFX7-NEXT: s_and_saveexec_b64 s[8:9], s[10:11] -; GFX7-NEXT: s_cbranch_execz .LBB0_4 +; GFX7-NEXT: s_cmov_b64 exec, s[10:11] +; GFX7-NEXT: s_cbranch_scc0 .LBB0_4 ; GFX7-NEXT: ; %bb.1: ; GFX7-NEXT: s_mov_b64 s[12:13], exec +; GFX7-NEXT: s_mov_b64 s[10:11], exec ; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s12, 0 ; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s13, v0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: s_and_b64 s[14:15], vcc, -1 ; GFX7-NEXT: ; implicit-def: $vgpr1 -; GFX7-NEXT: s_and_saveexec_b64 s[10:11], vcc -; GFX7-NEXT: s_cbranch_execz .LBB0_3 +; GFX7-NEXT: s_cmov_b64 exec, vcc +; GFX7-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_bcnt1_i32_b64 s12, s[12:13] ; GFX7-NEXT: s_mul_i32 s12, s12, 5 ; GFX7-NEXT: v_mov_b32_e32 v1, s12 ; GFX7-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc -; GFX7-NEXT: .LBB0_3: ; GFX7-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX7-NEXT: .LBB0_3: ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readfirstlane_b32 s4, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, 5, s4 -; GFX7-NEXT: .LBB0_4: ; %Flow ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: .LBB0_4: ; %Flow ; GFX7-NEXT: s_wqm_b64 s[4:5], -1 ; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5] ; GFX7-NEXT: s_andn2_b64 vcc, exec, s[4:5] @@ -51,30 +55,34 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa ; ; GFX89-LABEL: add_i32_constant: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_mov_b64 s[10:11], exec +; GFX89-NEXT: s_and_b64 s[10:11], exec, exec +; GFX89-NEXT: s_mov_b64 s[8:9], exec +; GFX89-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; GFX89-NEXT: ; implicit-def: $vgpr0 -; GFX89-NEXT: s_and_saveexec_b64 s[8:9], s[10:11] -; GFX89-NEXT: s_cbranch_execz .LBB0_4 +; GFX89-NEXT: s_cmov_b64 exec, s[10:11] +; GFX89-NEXT: s_cbranch_scc0 .LBB0_4 ; GFX89-NEXT: ; %bb.1: ; GFX89-NEXT: s_mov_b64 s[12:13], exec ; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s12, 0 ; GFX89-NEXT: v_mbcnt_hi_u32_b32 v0, s13, v0 ; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX89-NEXT: s_mov_b64 s[10:11], exec +; GFX89-NEXT: s_and_b64 s[14:15], vcc, -1 ; GFX89-NEXT: ; implicit-def: $vgpr1 -; GFX89-NEXT: s_and_saveexec_b64 s[10:11], vcc -; GFX89-NEXT: s_cbranch_execz .LBB0_3 +; GFX89-NEXT: s_cmov_b64 exec, vcc +; GFX89-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX89-NEXT: ; %bb.2: ; GFX89-NEXT: s_bcnt1_i32_b64 s12, s[12:13] ; GFX89-NEXT: s_mul_i32 s12, s12, 5 ; GFX89-NEXT: v_mov_b32_e32 v1, s12 ; GFX89-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc -; GFX89-NEXT: .LBB0_3: ; GFX89-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX89-NEXT: .LBB0_3: ; GFX89-NEXT: s_waitcnt vmcnt(0) ; GFX89-NEXT: v_readfirstlane_b32 s4, v1 ; GFX89-NEXT: v_mad_u32_u24 v0, v0, 5, s4 -; GFX89-NEXT: .LBB0_4: ; %Flow ; GFX89-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX89-NEXT: .LBB0_4: ; %Flow ; GFX89-NEXT: s_wqm_b64 s[4:5], -1 ; GFX89-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5] ; GFX89-NEXT: s_andn2_b64 vcc, exec, s[4:5] @@ -86,31 +94,35 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa ; ; GFX1064-LABEL: add_i32_constant: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[10:11], exec +; GFX1064-NEXT: s_and_b64 s[10:11], exec, exec +; GFX1064-NEXT: s_mov_b64 s[8:9], exec +; GFX1064-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], s[10:11] -; GFX1064-NEXT: s_cbranch_execz .LBB0_4 +; GFX1064-NEXT: s_cmov_b64 exec, s[10:11] +; GFX1064-NEXT: s_cbranch_scc0 .LBB0_4 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_mov_b64 s[12:13], exec -; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: s_mov_b64 s[10:11], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s12, 0 +; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s13, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[10:11], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB0_3 +; GFX1064-NEXT: s_and_b64 s[14:15], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX1064-NEXT: ; %bb.2: ; GFX1064-NEXT: s_bcnt1_i32_b64 s12, s[12:13] ; GFX1064-NEXT: s_mul_i32 s12, s12, 5 ; GFX1064-NEXT: v_mov_b32_e32 v1, s12 ; GFX1064-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc -; GFX1064-NEXT: .LBB0_3: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX1064-NEXT: .LBB0_3: ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_readfirstlane_b32 s4, v1 ; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s4 -; GFX1064-NEXT: .LBB0_4: ; %Flow ; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1064-NEXT: .LBB0_4: ; %Flow ; GFX1064-NEXT: s_wqm_b64 s[4:5], -1 ; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5] ; GFX1064-NEXT: s_andn2_b64 vcc, exec, s[4:5] @@ -122,30 +134,34 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa ; ; GFX1032-LABEL: add_i32_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s9, exec_lo +; GFX1032-NEXT: s_and_b32 s9, exec_lo, exec_lo +; GFX1032-NEXT: s_mov_b32 s8, exec_lo +; GFX1032-NEXT: s_and_b32 s10, s9, -1 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s8, s9 -; GFX1032-NEXT: s_cbranch_execz .LBB0_4 +; GFX1032-NEXT: s_cmov_b32 exec_lo, s9 +; GFX1032-NEXT: s_cbranch_scc0 .LBB0_4 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_mov_b32 s10, exec_lo -; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: s_mov_b32 s9, exec_lo ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s10, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s9, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB0_3 +; GFX1032-NEXT: s_and_b32 s11, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX1032-NEXT: ; %bb.2: ; GFX1032-NEXT: s_bcnt1_i32_b32 s10, s10 ; GFX1032-NEXT: s_mul_i32 s10, s10, 5 ; GFX1032-NEXT: v_mov_b32_e32 v1, s10 ; GFX1032-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc -; GFX1032-NEXT: .LBB0_3: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GFX1032-NEXT: .LBB0_3: ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_readfirstlane_b32 s4, v1 ; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s4 -; GFX1032-NEXT: .LBB0_4: ; %Flow ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1032-NEXT: .LBB0_4: ; %Flow ; GFX1032-NEXT: s_wqm_b32 s4, -1 ; GFX1032-NEXT: s_and_b32 s4, s4, s4 ; GFX1032-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4 @@ -157,11 +173,12 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa ; ; GFX1164-LABEL: add_i32_constant: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_mov_b64 s[10:11], exec +; GFX1164-NEXT: s_and_b64 s[10:11], exec, exec +; GFX1164-NEXT: s_mov_b64 s[8:9], exec +; GFX1164-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_saveexec_b64 s[8:9], s[10:11] -; GFX1164-NEXT: s_cbranch_execz .LBB0_4 +; GFX1164-NEXT: s_cmov_b64 exec, s[10:11] +; GFX1164-NEXT: s_cbranch_scc0 .LBB0_4 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_mov_b64 s[12:13], exec ; GFX1164-NEXT: s_mov_b64 s[10:11], exec @@ -169,22 +186,24 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s13, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB0_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_and_b64 s[14:15], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX1164-NEXT: ; %bb.2: ; GFX1164-NEXT: s_bcnt1_i32_b64 s12, s[12:13] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_mul_i32 s12, s12, 5 ; GFX1164-NEXT: v_mov_b32_e32 v1, s12 ; GFX1164-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc -; GFX1164-NEXT: .LBB0_3: ; GFX1164-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX1164-NEXT: .LBB0_3: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_readfirstlane_b32 s4, v1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_mad_u32_u24 v0, v0, 5, s4 -; GFX1164-NEXT: .LBB0_4: ; %Flow ; GFX1164-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1164-NEXT: .LBB0_4: ; %Flow ; GFX1164-NEXT: s_wqm_b64 s[4:5], -1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5] @@ -199,33 +218,36 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa ; ; GFX1132-LABEL: add_i32_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_mov_b32 s9, exec_lo +; GFX1132-NEXT: s_and_b32 s9, exec_lo, exec_lo +; GFX1132-NEXT: s_mov_b32 s8, exec_lo +; GFX1132-NEXT: s_and_b32 s10, s9, -1 ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_saveexec_b32 s8, s9 -; GFX1132-NEXT: s_cbranch_execz .LBB0_4 +; GFX1132-NEXT: s_cmov_b32 exec_lo, s9 +; GFX1132-NEXT: s_cbranch_scc0 .LBB0_4 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_mov_b32 s10, exec_lo ; GFX1132-NEXT: s_mov_b32 s9, exec_lo ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s10, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB0_3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_and_b32 s11, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX1132-NEXT: ; %bb.2: ; GFX1132-NEXT: s_bcnt1_i32_b32 s10, s10 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_mul_i32 s10, s10, 5 ; GFX1132-NEXT: v_mov_b32_e32 v1, s10 ; GFX1132-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc -; GFX1132-NEXT: .LBB0_3: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GFX1132-NEXT: .LBB0_3: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_readfirstlane_b32 s4, v1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_mad_u32_u24 v0, v0, 5, s4 -; GFX1132-NEXT: .LBB0_4: ; %Flow ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1132-NEXT: .LBB0_4: ; %Flow ; GFX1132-NEXT: s_wqm_b32 s4, -1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_b32 s4, s4, s4 @@ -266,22 +288,25 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; ; GFX8-LABEL: add_i32_varying: ; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_mov_b64 s[10:11], exec +; GFX8-NEXT: s_and_b64 s[10:11], s[10:11], exec ; GFX8-NEXT: s_mov_b64 s[8:9], exec -; GFX8-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX8-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; GFX8-NEXT: ; implicit-def: $vgpr3 -; GFX8-NEXT: s_and_saveexec_b64 s[8:9], s[10:11] -; GFX8-NEXT: s_cbranch_execz .LBB1_4 +; GFX8-NEXT: s_cmov_b64 exec, s[10:11] +; GFX8-NEXT: s_cbranch_scc0 .LBB1_4 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GFX8-NEXT: s_mov_b64 s[10:11], exec +; GFX8-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[10:11] +; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GFX8-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -293,25 +318,26 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s12, v2, 63 +; GFX8-NEXT: v_readlane_b32 s14, v2, 63 ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[10:11] +; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: s_and_b64 s[12:13], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[10:11], vcc -; GFX8-NEXT: s_cbranch_execz .LBB1_3 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB1_3 ; GFX8-NEXT: ; %bb.2: -; GFX8-NEXT: v_mov_b32_e32 v0, s12 +; GFX8-NEXT: v_mov_b32_e32 v0, s14 ; GFX8-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc -; GFX8-NEXT: .LBB1_3: ; GFX8-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX8-NEXT: .LBB1_3: ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v0 -; GFX8-NEXT: .LBB1_4: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: .LBB1_4: ; %Flow ; GFX8-NEXT: s_wqm_b64 s[4:5], -1 ; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5] ; GFX8-NEXT: s_andn2_b64 vcc, exec, s[4:5] @@ -323,22 +349,25 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; ; GFX9-LABEL: add_i32_varying: ; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_mov_b64 s[10:11], exec +; GFX9-NEXT: s_and_b64 s[10:11], s[10:11], exec ; GFX9-NEXT: s_mov_b64 s[8:9], exec -; GFX9-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX9-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; GFX9-NEXT: ; implicit-def: $vgpr3 -; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[10:11] -; GFX9-NEXT: s_cbranch_execz .LBB1_4 +; GFX9-NEXT: s_cmov_b64 exec, s[10:11] +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GFX9-NEXT: s_mov_b64 s[10:11], exec +; GFX9-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[10:11] +; GFX9-NEXT: s_mov_b64 exec, s[12:13] ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -350,25 +379,26 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s12, v2, 63 +; GFX9-NEXT: v_readlane_b32 s14, v2, 63 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[10:11] +; GFX9-NEXT: s_mov_b64 exec, s[12:13] ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_and_b64 s[12:13], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[10:11], vcc -; GFX9-NEXT: s_cbranch_execz .LBB1_3 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB1_3 ; GFX9-NEXT: ; %bb.2: -; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v0, s14 ; GFX9-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc -; GFX9-NEXT: .LBB1_3: ; GFX9-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX9-NEXT: .LBB1_3: ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: v_add_u32_e32 v3, s4, v0 -; GFX9-NEXT: .LBB1_4: ; %Flow ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-NEXT: .LBB1_4: ; %Flow ; GFX9-NEXT: s_wqm_b64 s[4:5], -1 ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5] ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] @@ -380,17 +410,20 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; ; GFX1064-LABEL: add_i32_varying: ; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: s_mov_b64 s[10:11], exec ; GFX1064-NEXT: s_mov_b64 s[8:9], exec +; GFX1064-NEXT: s_and_b64 s[10:11], s[10:11], exec ; GFX1064-NEXT: ; implicit-def: $vgpr4 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[8:9] -; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], s[10:11] -; GFX1064-NEXT: s_cbranch_execz .LBB1_4 +; GFX1064-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; GFX1064-NEXT: s_cmov_b64 exec, s[10:11] +; GFX1064-NEXT: s_cbranch_scc0 .LBB1_4 ; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_mov_b64 s[10:11], exec ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_not_b64 exec, exec ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GFX1064-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -399,40 +432,44 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX1064-NEXT: v_mov_b32_e32 v2, v1 ; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s12, v1, 31 -; GFX1064-NEXT: v_mov_b32_e32 v2, s12 +; GFX1064-NEXT: v_readlane_b32 s14, v1, 31 +; GFX1064-NEXT: v_mov_b32_e32 v2, s14 ; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s12, v1, 15 -; GFX1064-NEXT: v_readlane_b32 s13, v1, 31 -; GFX1064-NEXT: v_writelane_b32 v3, s12, 16 -; GFX1064-NEXT: s_mov_b64 exec, s[10:11] +; GFX1064-NEXT: v_readlane_b32 s14, v1, 15 +; GFX1064-NEXT: s_mov_b64 exec, s[12:13] ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[10:11], -1 -; GFX1064-NEXT: v_readlane_b32 s12, v1, 63 -; GFX1064-NEXT: v_readlane_b32 s14, v1, 47 -; GFX1064-NEXT: v_writelane_b32 v3, s13, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[10:11] +; GFX1064-NEXT: s_or_saveexec_b64 s[12:13], -1 +; GFX1064-NEXT: v_readlane_b32 s15, v1, 31 +; GFX1064-NEXT: v_writelane_b32 v3, s14, 16 +; GFX1064-NEXT: s_mov_b64 exec, s[12:13] ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: s_or_saveexec_b64 s[10:11], -1 -; GFX1064-NEXT: v_writelane_b32 v3, s14, 48 -; GFX1064-NEXT: s_mov_b64 exec, s[10:11] +; GFX1064-NEXT: s_or_saveexec_b64 s[12:13], -1 +; GFX1064-NEXT: v_readlane_b32 s14, v1, 47 +; GFX1064-NEXT: v_writelane_b32 v3, s15, 32 +; GFX1064-NEXT: v_readlane_b32 s15, v1, 63 +; GFX1064-NEXT: s_mov_b64 exec, s[12:13] ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_or_saveexec_b64 s[12:13], -1 +; GFX1064-NEXT: v_writelane_b32 v3, s14, 48 +; GFX1064-NEXT: s_mov_b64 exec, s[12:13] +; GFX1064-NEXT: s_mov_b32 s12, s15 +; GFX1064-NEXT: s_and_b64 s[14:15], vcc, -1 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[10:11], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB1_3 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB1_3 ; GFX1064-NEXT: ; %bb.2: ; GFX1064-NEXT: v_mov_b32_e32 v0, s12 ; GFX1064-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc -; GFX1064-NEXT: .LBB1_3: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX1064-NEXT: .LBB1_3: ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1064-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-NEXT: v_add_nc_u32_e32 v4, s4, v0 -; GFX1064-NEXT: .LBB1_4: ; %Flow ; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1064-NEXT: .LBB1_4: ; %Flow ; GFX1064-NEXT: s_wqm_b64 s[4:5], -1 ; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5] ; GFX1064-NEXT: s_andn2_b64 vcc, exec, s[4:5] @@ -444,17 +481,20 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; ; GFX1032-LABEL: add_i32_varying: ; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: s_mov_b32 s9, exec_lo ; GFX1032-NEXT: s_mov_b32 s8, exec_lo +; GFX1032-NEXT: s_and_b32 s9, s9, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr4 -; GFX1032-NEXT: s_mov_b32 s9, s8 -; GFX1032-NEXT: s_and_saveexec_b32 s8, s9 -; GFX1032-NEXT: s_cbranch_execz .LBB1_4 +; GFX1032-NEXT: s_and_b32 s10, s9, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, s9 +; GFX1032-NEXT: s_cbranch_scc0 .LBB1_4 ; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_mov_b32 s9, exec_lo ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s9, -1 +; GFX1032-NEXT: s_or_saveexec_b32 s10, -1 ; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -463,30 +503,33 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX1032-NEXT: v_mov_b32_e32 v2, v1 ; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s11, v1, 31 -; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s10, v1, 15 -; GFX1032-NEXT: s_mov_b32 exec_lo, s9 +; GFX1032-NEXT: s_mov_b32 exec_lo, s10 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_or_saveexec_b32 s9, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s10, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s9 +; GFX1032-NEXT: s_or_saveexec_b32 s10, -1 +; GFX1032-NEXT: v_readlane_b32 s12, v1, 31 +; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_readlane_b32 s11, v1, 15 +; GFX1032-NEXT: s_mov_b32 exec_lo, s10 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_or_saveexec_b32 s10, -1 +; GFX1032-NEXT: v_writelane_b32 v3, s11, 16 +; GFX1032-NEXT: s_mov_b32 exec_lo, s10 +; GFX1032-NEXT: s_and_b32 s11, vcc_lo, -1 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s9, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB1_3 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB1_3 ; GFX1032-NEXT: ; %bb.2: -; GFX1032-NEXT: v_mov_b32_e32 v0, s11 +; GFX1032-NEXT: v_mov_b32_e32 v0, s12 ; GFX1032-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc -; GFX1032-NEXT: .LBB1_3: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GFX1032-NEXT: .LBB1_3: ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1032-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-NEXT: v_add_nc_u32_e32 v4, s4, v0 -; GFX1032-NEXT: .LBB1_4: ; %Flow ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1032-NEXT: .LBB1_4: ; %Flow ; GFX1032-NEXT: s_wqm_b32 s4, -1 ; GFX1032-NEXT: s_and_b32 s4, s4, s4 ; GFX1032-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4 @@ -498,18 +541,21 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; ; GFX1164-LABEL: add_i32_varying: ; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: s_mov_b64 s[10:11], exec ; GFX1164-NEXT: s_mov_b64 s[8:9], exec +; GFX1164-NEXT: s_and_b64 s[10:11], s[10:11], exec ; GFX1164-NEXT: ; implicit-def: $vgpr4 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: s_mov_b64 s[10:11], s[8:9] -; GFX1164-NEXT: s_and_saveexec_b64 s[8:9], s[10:11] -; GFX1164-NEXT: s_cbranch_execz .LBB1_4 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; GFX1164-NEXT: s_cmov_b64 exec, s[10:11] +; GFX1164-NEXT: s_cbranch_scc0 .LBB1_4 ; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_mov_b64 s[10:11], exec ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_not_b64 exec, exec ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GFX1164-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 @@ -522,44 +568,47 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s12, v1, 31 +; GFX1164-NEXT: v_readlane_b32 s14, v1, 31 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s12 +; GFX1164-NEXT: v_mov_b32_e32 v2, s14 ; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s12, v1, 15 -; GFX1164-NEXT: v_readlane_b32 s13, v1, 31 -; GFX1164-NEXT: v_writelane_b32 v3, s12, 16 -; GFX1164-NEXT: s_mov_b64 exec, s[10:11] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: v_readlane_b32 s14, v1, 15 +; GFX1164-NEXT: s_mov_b64 exec, s[12:13] ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[10:11], -1 -; GFX1164-NEXT: v_readlane_b32 s12, v1, 63 -; GFX1164-NEXT: v_readlane_b32 s14, v1, 47 -; GFX1164-NEXT: v_writelane_b32 v3, s13, 32 -; GFX1164-NEXT: s_mov_b64 exec, s[10:11] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164-NEXT: s_or_saveexec_b64 s[12:13], -1 +; GFX1164-NEXT: v_readlane_b32 s15, v1, 31 +; GFX1164-NEXT: v_writelane_b32 v3, s14, 16 +; GFX1164-NEXT: s_mov_b64 exec, s[12:13] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: s_or_saveexec_b64 s[10:11], -1 -; GFX1164-NEXT: v_writelane_b32 v3, s14, 48 -; GFX1164-NEXT: s_mov_b64 exec, s[10:11] +; GFX1164-NEXT: s_or_saveexec_b64 s[12:13], -1 +; GFX1164-NEXT: v_readlane_b32 s14, v1, 47 +; GFX1164-NEXT: v_writelane_b32 v3, s15, 32 +; GFX1164-NEXT: v_readlane_b32 s15, v1, 63 +; GFX1164-NEXT: s_mov_b64 exec, s[12:13] ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_or_saveexec_b64 s[12:13], -1 +; GFX1164-NEXT: v_writelane_b32 v3, s14, 48 +; GFX1164-NEXT: s_mov_b64 exec, s[12:13] +; GFX1164-NEXT: s_mov_b32 s12, s15 +; GFX1164-NEXT: s_and_b64 s[14:15], vcc, -1 ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[10:11], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB1_3 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB1_3 ; GFX1164-NEXT: ; %bb.2: ; GFX1164-NEXT: v_mov_b32_e32 v0, s12 ; GFX1164-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc -; GFX1164-NEXT: .LBB1_3: ; GFX1164-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX1164-NEXT: .LBB1_3: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1164-NEXT: v_mov_b32_e32 v0, v3 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_add_nc_u32_e32 v4, s4, v0 -; GFX1164-NEXT: .LBB1_4: ; %Flow ; GFX1164-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1164-NEXT: .LBB1_4: ; %Flow ; GFX1164-NEXT: s_wqm_b64 s[4:5], -1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5] @@ -574,18 +623,21 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; ; GFX1132-LABEL: add_i32_varying: ; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: s_mov_b32 s9, exec_lo ; GFX1132-NEXT: s_mov_b32 s8, exec_lo +; GFX1132-NEXT: s_and_b32 s9, s9, exec_lo ; GFX1132-NEXT: ; implicit-def: $vgpr4 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: s_mov_b32 s9, s8 -; GFX1132-NEXT: s_and_saveexec_b32 s8, s9 -; GFX1132-NEXT: s_cbranch_execz .LBB1_4 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_b32 s10, s9, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, s9 +; GFX1132-NEXT: s_cbranch_scc0 .LBB1_4 ; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_mov_b32 s9, exec_lo ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s9, -1 +; GFX1132-NEXT: s_or_saveexec_b32 s10, -1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 @@ -596,34 +648,37 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_readlane_b32 s11, v1, 31 -; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: v_readlane_b32 s10, v1, 15 -; GFX1132-NEXT: s_mov_b32 exec_lo, s9 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-NEXT: s_mov_b32 exec_lo, s10 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_or_saveexec_b32 s9, -1 -; GFX1132-NEXT: v_writelane_b32 v3, s10, 16 -; GFX1132-NEXT: s_mov_b32 exec_lo, s9 +; GFX1132-NEXT: s_or_saveexec_b32 s10, -1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-NEXT: v_readlane_b32 s12, v1, 31 +; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132-NEXT: v_readlane_b32 s11, v1, 15 +; GFX1132-NEXT: s_mov_b32 exec_lo, s10 ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_or_saveexec_b32 s10, -1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-NEXT: v_writelane_b32 v3, s11, 16 +; GFX1132-NEXT: s_mov_b32 exec_lo, s10 +; GFX1132-NEXT: s_and_b32 s11, vcc_lo, -1 ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s9, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB1_3 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB1_3 ; GFX1132-NEXT: ; %bb.2: -; GFX1132-NEXT: v_mov_b32_e32 v0, s11 +; GFX1132-NEXT: v_mov_b32_e32 v0, s12 ; GFX1132-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc -; GFX1132-NEXT: .LBB1_3: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GFX1132-NEXT: .LBB1_3: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1132-NEXT: v_mov_b32_e32 v0, v3 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_add_nc_u32_e32 v4, s4, v0 -; GFX1132-NEXT: .LBB1_4: ; %Flow ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1132-NEXT: .LBB1_4: ; %Flow ; GFX1132-NEXT: s_wqm_b32 s4, -1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_and_b32 s4, s4, s4 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll index ca94d68f01917..476172dde6c82 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -19,12 +19,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-LABEL: add_i32_constant: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec +; GFX6-NEXT: s_mov_b64 s[2:3], exec ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX6-NEXT: s_cbranch_execz .LBB0_2 +; GFX6-NEXT: s_cmov_b64 exec, vcc +; GFX6-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX6-NEXT: ; %bb.1: ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -32,8 +34,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc -; GFX6-NEXT: .LBB0_2: ; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: .LBB0_2: ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -50,9 +52,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB0_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -60,8 +64,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc -; GFX8-NEXT: .LBB0_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: .LBB0_2: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 @@ -78,9 +82,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB0_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -88,8 +94,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc -; GFX9-NEXT: .LBB0_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB0_2: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 @@ -102,12 +108,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-LABEL: add_i32_constant: ; GFX10W64: ; %bb.0: ; %entry ; GFX10W64-NEXT: s_mov_b64 s[4:5], exec -; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_cbranch_execz .LBB0_2 +; GFX10W64-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX10W64-NEXT: s_cmov_b64 exec, vcc +; GFX10W64-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -115,9 +123,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc -; GFX10W64-NEXT: .LBB0_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: .LBB0_2: ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -130,11 +138,13 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W32-LABEL: add_i32_constant: ; GFX10W32: ; %bb.0: ; %entry ; GFX10W32-NEXT: s_mov_b32 s3, exec_lo -; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: s_mov_b32 s2, exec_lo ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB0_2 +; GFX10W32-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX10W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10W32-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX10W32-NEXT: ; %bb.1: ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX10W32-NEXT: s_bcnt1_i32_b32 s3, s3 @@ -142,9 +152,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc -; GFX10W32-NEXT: .LBB0_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: .LBB0_2: ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 @@ -162,8 +172,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W64-NEXT: s_cbranch_execz .LBB0_2 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11W64-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX11W64-NEXT: s_cmov_b64 exec, vcc +; GFX11W64-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -172,8 +184,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc -; GFX11W64-NEXT: .LBB0_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: .LBB0_2: ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -193,8 +205,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W32-NEXT: s_cbranch_execz .LBB0_2 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11W32-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX11W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11W32-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX11W32-NEXT: ; %bb.1: ; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX11W32-NEXT: s_bcnt1_i32_b32 s3, s3 @@ -203,8 +217,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc -; GFX11W32-NEXT: .LBB0_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: .LBB0_2: ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 @@ -225,8 +239,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W64-NEXT: s_cbranch_execz .LBB0_2 +; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX12W64-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX12W64-NEXT: s_cmov_b64 exec, vcc +; GFX12W64-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX12W64-NEXT: ; %bb.1: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -235,8 +251,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: .LBB0_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: .LBB0_2: ; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -256,8 +272,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W32-NEXT: s_cbranch_execz .LBB0_2 +; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX12W32-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX12W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12W32-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s3, s3 @@ -266,8 +284,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: .LBB0_2: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX12W32-NEXT: .LBB0_2: ; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 @@ -289,13 +307,15 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX6-LABEL: add_i32_uniform: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec +; GFX6-NEXT: s_mov_b64 s[2:3], exec ; GFX6-NEXT: s_load_dword s6, s[0:1], 0x11 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX6-NEXT: s_cbranch_execz .LBB1_2 +; GFX6-NEXT: s_cmov_b64 exec, vcc +; GFX6-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX6-NEXT: ; %bb.1: ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -303,8 +323,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX6-NEXT: s_mul_i32 s4, s6, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc -; GFX6-NEXT: .LBB1_2: ; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: .LBB1_2: ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -318,14 +338,16 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: add_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB1_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -333,8 +355,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX8-NEXT: s_mul_i32 s4, s6, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc -; GFX8-NEXT: .LBB1_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: .LBB1_2: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -348,14 +370,16 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: add_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -363,8 +387,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: s_mul_i32 s4, s6, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc -; GFX9-NEXT: .LBB1_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB1_2: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -379,12 +403,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W64: ; %bb.0: ; %entry ; GFX10W64-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX10W64-NEXT: s_mov_b64 s[4:5], exec -; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_cbranch_execz .LBB1_2 +; GFX10W64-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX10W64-NEXT: s_cmov_b64 exec, vcc +; GFX10W64-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -392,9 +418,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W64-NEXT: s_mul_i32 s4, s6, s4 ; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W64-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc -; GFX10W64-NEXT: .LBB1_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: .LBB1_2: ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -408,11 +434,13 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W32: ; %bb.0: ; %entry ; GFX10W32-NEXT: s_load_dword s2, s[0:1], 0x44 ; GFX10W32-NEXT: s_mov_b32 s4, exec_lo -; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB1_2 +; GFX10W32-NEXT: s_and_b32 s5, vcc_lo, -1 +; GFX10W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10W32-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX10W32-NEXT: ; %bb.1: ; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4 @@ -420,9 +448,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX10W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W32-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc -; GFX10W32-NEXT: .LBB1_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10W32-NEXT: .LBB1_2: ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: v_readfirstlane_b32 s4, v1 @@ -441,8 +469,10 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W64-NEXT: s_cbranch_execz .LBB1_2 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11W64-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX11W64-NEXT: s_cmov_b64 exec, vcc +; GFX11W64-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -451,8 +481,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc -; GFX11W64-NEXT: .LBB1_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: .LBB1_2: ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -473,8 +503,10 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W32-NEXT: s_cbranch_execz .LBB1_2 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11W32-NEXT: s_and_b32 s5, vcc_lo, -1 +; GFX11W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11W32-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX11W32-NEXT: ; %bb.1: ; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4 @@ -483,8 +515,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc -; GFX11W32-NEXT: .LBB1_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11W32-NEXT: .LBB1_2: ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1 @@ -506,8 +538,10 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W64-NEXT: s_cbranch_execz .LBB1_2 +; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX12W64-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX12W64-NEXT: s_cmov_b64 exec, vcc +; GFX12W64-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX12W64-NEXT: ; %bb.1: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -516,8 +550,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: .LBB1_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: .LBB1_2: ; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -538,8 +572,10 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W32-NEXT: s_cbranch_execz .LBB1_2 +; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX12W32-NEXT: s_and_b32 s5, vcc_lo, -1 +; GFX12W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12W32-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 @@ -548,8 +584,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: .LBB1_2: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX12W32-NEXT: .LBB1_2: ; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1 @@ -600,17 +636,18 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX8-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX8-NEXT: s_cbranch_execz .LBB2_4 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc -; GFX8-NEXT: .LBB2_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: .LBB2_4: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 @@ -641,17 +678,18 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX9-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX9-NEXT: s_cbranch_execz .LBB2_4 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc -; GFX9-NEXT: .LBB2_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB2_4: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 @@ -681,17 +719,18 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX10W64-NEXT: s_cbranch_execz .LBB2_4 +; GFX10W64-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX10W64-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX10W64-NEXT: s_cmov_b64 exec, vcc +; GFX10W64-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX10W64-NEXT: ; %bb.3: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc -; GFX10W64-NEXT: .LBB2_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: .LBB2_4: ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 @@ -720,17 +759,18 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 -; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX10W32-NEXT: s_cbranch_execz .LBB2_4 +; GFX10W32-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX10W32-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX10W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10W32-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX10W32-NEXT: ; %bb.3: ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc -; GFX10W32-NEXT: .LBB2_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10W32-NEXT: .LBB2_4: ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 @@ -762,17 +802,17 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX11W64-NEXT: ; implicit-def: $vgpr0 -; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX11W64-NEXT: s_cbranch_execz .LBB2_4 +; GFX11W64-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX11W64-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX11W64-NEXT: s_cmov_b64 exec, vcc +; GFX11W64-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX11W64-NEXT: ; %bb.3: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc -; GFX11W64-NEXT: .LBB2_4: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: .LBB2_4: ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 @@ -803,19 +843,20 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11W32-NEXT: ; implicit-def: $vgpr0 -; GFX11W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX11W32-NEXT: s_cbranch_execz .LBB2_4 +; GFX11W32-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX11W32-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX11W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11W32-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX11W32-NEXT: ; %bb.3: ; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc -; GFX11W32-NEXT: .LBB2_4: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11W32-NEXT: .LBB2_4: ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 @@ -849,17 +890,17 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX12W64-NEXT: ; implicit-def: $vgpr0 -; GFX12W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX12W64-NEXT: s_cbranch_execz .LBB2_4 +; GFX12W64-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX12W64-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX12W64-NEXT: s_cmov_b64 exec, vcc +; GFX12W64-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX12W64-NEXT: ; %bb.3: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: .LBB2_4: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: .LBB2_4: ; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0 @@ -890,19 +931,20 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX12W32-NEXT: ; implicit-def: $vgpr0 -; GFX12W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX12W32-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX12W32-NEXT: s_cbranch_execz .LBB2_4 +; GFX12W32-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX12W32-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX12W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12W32-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX12W32-NEXT: ; %bb.3: ; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX12W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: .LBB2_4: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX12W32-NEXT: .LBB2_4: ; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0 @@ -1009,12 +1051,14 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-LABEL: sub_i32_constant: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec +; GFX6-NEXT: s_mov_b64 s[2:3], exec ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX6-NEXT: s_cbranch_execz .LBB4_2 +; GFX6-NEXT: s_cmov_b64 exec, vcc +; GFX6-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX6-NEXT: ; %bb.1: ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1022,8 +1066,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc -; GFX6-NEXT: .LBB4_2: ; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: .LBB4_2: ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -1041,9 +1085,11 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB4_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1051,8 +1097,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc -; GFX8-NEXT: .LBB4_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: .LBB4_2: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 @@ -1070,9 +1116,11 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB4_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1080,8 +1128,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc -; GFX9-NEXT: .LBB4_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB4_2: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 @@ -1095,12 +1143,14 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-LABEL: sub_i32_constant: ; GFX10W64: ; %bb.0: ; %entry ; GFX10W64-NEXT: s_mov_b64 s[4:5], exec -; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_cbranch_execz .LBB4_2 +; GFX10W64-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX10W64-NEXT: s_cmov_b64 exec, vcc +; GFX10W64-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1108,9 +1158,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc -; GFX10W64-NEXT: .LBB4_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: .LBB4_2: ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -1124,11 +1174,13 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W32-LABEL: sub_i32_constant: ; GFX10W32: ; %bb.0: ; %entry ; GFX10W32-NEXT: s_mov_b32 s3, exec_lo -; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: s_mov_b32 s2, exec_lo ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB4_2 +; GFX10W32-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX10W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10W32-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX10W32-NEXT: ; %bb.1: ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX10W32-NEXT: s_bcnt1_i32_b32 s3, s3 @@ -1136,9 +1188,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[4:7], 0 glc -; GFX10W32-NEXT: .LBB4_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: .LBB4_2: ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 @@ -1157,8 +1209,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W64-NEXT: s_cbranch_execz .LBB4_2 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11W64-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX11W64-NEXT: s_cmov_b64 exec, vcc +; GFX11W64-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1167,8 +1221,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc -; GFX11W64-NEXT: .LBB4_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: .LBB4_2: ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -1189,8 +1243,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W32-NEXT: s_cbranch_execz .LBB4_2 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11W32-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX11W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11W32-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX11W32-NEXT: ; %bb.1: ; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX11W32-NEXT: s_bcnt1_i32_b32 s3, s3 @@ -1199,8 +1255,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], 0 glc -; GFX11W32-NEXT: .LBB4_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: .LBB4_2: ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 @@ -1222,8 +1278,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W64-NEXT: s_cbranch_execz .LBB4_2 +; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX12W64-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX12W64-NEXT: s_cmov_b64 exec, vcc +; GFX12W64-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX12W64-NEXT: ; %bb.1: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1232,8 +1290,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: .LBB4_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: .LBB4_2: ; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -1254,8 +1312,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W32-NEXT: s_cbranch_execz .LBB4_2 +; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX12W32-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX12W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12W32-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s3, s3 @@ -1264,8 +1324,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: .LBB4_2: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX12W32-NEXT: .LBB4_2: ; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 @@ -1288,13 +1348,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX6-LABEL: sub_i32_uniform: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec +; GFX6-NEXT: s_mov_b64 s[2:3], exec ; GFX6-NEXT: s_load_dword s6, s[0:1], 0x11 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX6-NEXT: s_cbranch_execz .LBB5_2 +; GFX6-NEXT: s_cmov_b64 exec, vcc +; GFX6-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX6-NEXT: ; %bb.1: ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1302,8 +1364,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX6-NEXT: s_mul_i32 s4, s6, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc -; GFX6-NEXT: .LBB5_2: ; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: .LBB5_2: ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -1317,14 +1379,16 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: sub_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB5_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1332,8 +1396,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX8-NEXT: s_mul_i32 s4, s6, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc -; GFX8-NEXT: .LBB5_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: .LBB5_2: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -1347,14 +1411,16 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: sub_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1362,8 +1428,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: s_mul_i32 s4, s6, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc -; GFX9-NEXT: .LBB5_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB5_2: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -1378,12 +1444,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W64: ; %bb.0: ; %entry ; GFX10W64-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX10W64-NEXT: s_mov_b64 s[4:5], exec -; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_cbranch_execz .LBB5_2 +; GFX10W64-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX10W64-NEXT: s_cmov_b64 exec, vcc +; GFX10W64-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1391,9 +1459,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W64-NEXT: s_mul_i32 s4, s6, s4 ; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc -; GFX10W64-NEXT: .LBB5_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: .LBB5_2: ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -1408,11 +1476,13 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W32: ; %bb.0: ; %entry ; GFX10W32-NEXT: s_load_dword s2, s[0:1], 0x44 ; GFX10W32-NEXT: s_mov_b32 s4, exec_lo -; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB5_2 +; GFX10W32-NEXT: s_and_b32 s5, vcc_lo, -1 +; GFX10W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10W32-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX10W32-NEXT: ; %bb.1: ; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4 @@ -1420,9 +1490,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX10W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc -; GFX10W32-NEXT: .LBB5_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10W32-NEXT: .LBB5_2: ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: v_mul_lo_u32 v0, s2, v0 @@ -1442,8 +1512,10 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W64-NEXT: s_cbranch_execz .LBB5_2 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11W64-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX11W64-NEXT: s_cmov_b64 exec, vcc +; GFX11W64-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1452,8 +1524,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc -; GFX11W64-NEXT: .LBB5_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: .LBB5_2: ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -1475,8 +1547,10 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W32-NEXT: s_cbranch_execz .LBB5_2 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11W32-NEXT: s_and_b32 s5, vcc_lo, -1 +; GFX11W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11W32-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX11W32-NEXT: ; %bb.1: ; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4 @@ -1485,8 +1559,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc -; GFX11W32-NEXT: .LBB5_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11W32-NEXT: .LBB5_2: ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: v_mul_lo_u32 v0, s2, v0 @@ -1509,8 +1583,10 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W64-NEXT: s_cbranch_execz .LBB5_2 +; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX12W64-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX12W64-NEXT: s_cmov_b64 exec, vcc +; GFX12W64-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX12W64-NEXT: ; %bb.1: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1519,8 +1595,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: .LBB5_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: .LBB5_2: ; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -1542,8 +1618,10 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W32-NEXT: s_cbranch_execz .LBB5_2 +; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX12W32-NEXT: s_and_b32 s5, vcc_lo, -1 +; GFX12W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12W32-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 @@ -1552,8 +1630,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: .LBB5_2: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX12W32-NEXT: .LBB5_2: ; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: v_mul_lo_u32 v0, s2, v0 @@ -1605,17 +1683,18 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX8-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX8-NEXT: s_cbranch_execz .LBB6_4 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB6_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc -; GFX8-NEXT: .LBB6_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: .LBB6_4: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 @@ -1646,17 +1725,18 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX9-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX9-NEXT: s_cbranch_execz .LBB6_4 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB6_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc -; GFX9-NEXT: .LBB6_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB6_4: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 @@ -1686,17 +1766,18 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX10W64-NEXT: s_cbranch_execz .LBB6_4 +; GFX10W64-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX10W64-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX10W64-NEXT: s_cmov_b64 exec, vcc +; GFX10W64-NEXT: s_cbranch_scc0 .LBB6_4 ; GFX10W64-NEXT: ; %bb.3: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc -; GFX10W64-NEXT: .LBB6_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: .LBB6_4: ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 @@ -1725,17 +1806,18 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 -; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX10W32-NEXT: s_cbranch_execz .LBB6_4 +; GFX10W32-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX10W32-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX10W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10W32-NEXT: s_cbranch_scc0 .LBB6_4 ; GFX10W32-NEXT: ; %bb.3: ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc -; GFX10W32-NEXT: .LBB6_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10W32-NEXT: .LBB6_4: ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 @@ -1767,17 +1849,17 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX11W64-NEXT: ; implicit-def: $vgpr0 -; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX11W64-NEXT: s_cbranch_execz .LBB6_4 +; GFX11W64-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX11W64-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX11W64-NEXT: s_cmov_b64 exec, vcc +; GFX11W64-NEXT: s_cbranch_scc0 .LBB6_4 ; GFX11W64-NEXT: ; %bb.3: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc -; GFX11W64-NEXT: .LBB6_4: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: .LBB6_4: ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 @@ -1808,19 +1890,20 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11W32-NEXT: ; implicit-def: $vgpr0 -; GFX11W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX11W32-NEXT: s_cbranch_execz .LBB6_4 +; GFX11W32-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX11W32-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX11W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11W32-NEXT: s_cbranch_scc0 .LBB6_4 ; GFX11W32-NEXT: ; %bb.3: ; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc -; GFX11W32-NEXT: .LBB6_4: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11W32-NEXT: .LBB6_4: ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 @@ -1855,17 +1938,17 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX12W64-NEXT: ; implicit-def: $vgpr0 -; GFX12W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX12W64-NEXT: s_cbranch_execz .LBB6_4 +; GFX12W64-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX12W64-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX12W64-NEXT: s_cmov_b64 exec, vcc +; GFX12W64-NEXT: s_cbranch_scc0 .LBB6_4 ; GFX12W64-NEXT: ; %bb.3: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: .LBB6_4: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: .LBB6_4: ; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0 @@ -1896,19 +1979,20 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX12W32-NEXT: ; implicit-def: $vgpr0 -; GFX12W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX12W32-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX12W32-NEXT: s_cbranch_execz .LBB6_4 +; GFX12W32-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX12W32-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX12W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12W32-NEXT: s_cbranch_scc0 .LBB6_4 ; GFX12W32-NEXT: ; %bb.3: ; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX12W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: .LBB6_4: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX12W32-NEXT: .LBB6_4: ; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll index 7e15c07f95269..8286423d5e52f 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -19,12 +19,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-LABEL: add_i32_constant: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec +; GFX6-NEXT: s_mov_b64 s[2:3], exec ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX6-NEXT: s_cbranch_execz .LBB0_2 +; GFX6-NEXT: s_cmov_b64 exec, vcc +; GFX6-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX6-NEXT: ; %bb.1: ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -33,8 +35,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc -; GFX6-NEXT: .LBB0_2: ; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: .LBB0_2: ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -51,9 +53,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB0_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -62,8 +66,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc -; GFX8-NEXT: .LBB0_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: .LBB0_2: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 @@ -80,9 +84,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB0_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -91,8 +97,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc -; GFX9-NEXT: .LBB0_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB0_2: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 @@ -105,12 +111,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-LABEL: add_i32_constant: ; GFX10W64: ; %bb.0: ; %entry ; GFX10W64-NEXT: s_mov_b64 s[4:5], exec -; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_cbranch_execz .LBB0_2 +; GFX10W64-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX10W64-NEXT: s_cmov_b64 exec, vcc +; GFX10W64-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -119,9 +127,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc -; GFX10W64-NEXT: .LBB0_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: .LBB0_2: ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -134,11 +142,13 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W32-LABEL: add_i32_constant: ; GFX10W32: ; %bb.0: ; %entry ; GFX10W32-NEXT: s_mov_b32 s3, exec_lo -; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: s_mov_b32 s2, exec_lo ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB0_2 +; GFX10W32-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX10W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10W32-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX10W32-NEXT: ; %bb.1: ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX10W32-NEXT: s_bcnt1_i32_b32 s3, s3 @@ -147,9 +157,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_add v1, v2, s[4:7], 0 idxen glc -; GFX10W32-NEXT: .LBB0_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: .LBB0_2: ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 @@ -167,8 +177,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W64-NEXT: s_cbranch_execz .LBB0_2 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11W64-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX11W64-NEXT: s_cmov_b64 exec, vcc +; GFX11W64-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -178,8 +190,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc -; GFX11W64-NEXT: .LBB0_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: .LBB0_2: ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -199,8 +211,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W32-NEXT: s_cbranch_execz .LBB0_2 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11W32-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX11W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11W32-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX11W32-NEXT: ; %bb.1: ; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX11W32-NEXT: s_bcnt1_i32_b32 s3, s3 @@ -210,8 +224,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], 0 idxen glc -; GFX11W32-NEXT: .LBB0_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: .LBB0_2: ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 @@ -232,8 +246,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W64-NEXT: s_cbranch_execz .LBB0_2 +; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX12W64-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX12W64-NEXT: s_cmov_b64 exec, vcc +; GFX12W64-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX12W64-NEXT: ; %bb.1: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -243,8 +259,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: .LBB0_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: .LBB0_2: ; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -264,8 +280,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W32-NEXT: s_cbranch_execz .LBB0_2 +; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX12W32-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX12W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12W32-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s3, s3 @@ -274,8 +292,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: .LBB0_2: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX12W32-NEXT: .LBB0_2: ; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 @@ -297,13 +315,15 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX6-LABEL: add_i32_uniform: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec +; GFX6-NEXT: s_mov_b64 s[2:3], exec ; GFX6-NEXT: s_load_dword s6, s[0:1], 0x11 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX6-NEXT: s_cbranch_execz .LBB1_2 +; GFX6-NEXT: s_cmov_b64 exec, vcc +; GFX6-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX6-NEXT: ; %bb.1: ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -312,8 +332,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc -; GFX6-NEXT: .LBB1_2: ; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: .LBB1_2: ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -327,14 +347,16 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: add_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB1_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -343,8 +365,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc -; GFX8-NEXT: .LBB1_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: .LBB1_2: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -358,14 +380,16 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: add_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -374,8 +398,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc -; GFX9-NEXT: .LBB1_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB1_2: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -390,12 +414,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W64: ; %bb.0: ; %entry ; GFX10W64-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX10W64-NEXT: s_mov_b64 s[4:5], exec -; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_cbranch_execz .LBB1_2 +; GFX10W64-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX10W64-NEXT: s_cmov_b64 exec, vcc +; GFX10W64-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -404,9 +430,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W64-NEXT: s_mul_i32 s4, s6, s4 ; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W64-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc -; GFX10W64-NEXT: .LBB1_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: .LBB1_2: ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -420,11 +446,13 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W32: ; %bb.0: ; %entry ; GFX10W32-NEXT: s_load_dword s2, s[0:1], 0x44 ; GFX10W32-NEXT: s_mov_b32 s4, exec_lo -; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB1_2 +; GFX10W32-NEXT: s_and_b32 s5, vcc_lo, -1 +; GFX10W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10W32-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX10W32-NEXT: ; %bb.1: ; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4 @@ -433,9 +461,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX10W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W32-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc -; GFX10W32-NEXT: .LBB1_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10W32-NEXT: .LBB1_2: ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: v_readfirstlane_b32 s4, v1 @@ -454,8 +482,10 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W64-NEXT: s_cbranch_execz .LBB1_2 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11W64-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX11W64-NEXT: s_cmov_b64 exec, vcc +; GFX11W64-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -465,8 +495,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc -; GFX11W64-NEXT: .LBB1_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: .LBB1_2: ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -487,8 +517,10 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W32-NEXT: s_cbranch_execz .LBB1_2 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11W32-NEXT: s_and_b32 s5, vcc_lo, -1 +; GFX11W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11W32-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX11W32-NEXT: ; %bb.1: ; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4 @@ -498,8 +530,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc -; GFX11W32-NEXT: .LBB1_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11W32-NEXT: .LBB1_2: ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1 @@ -521,8 +553,10 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W64-NEXT: s_cbranch_execz .LBB1_2 +; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX12W64-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX12W64-NEXT: s_cmov_b64 exec, vcc +; GFX12W64-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX12W64-NEXT: ; %bb.1: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -532,8 +566,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: .LBB1_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: .LBB1_2: ; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -554,8 +588,10 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W32-NEXT: s_cbranch_execz .LBB1_2 +; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX12W32-NEXT: s_and_b32 s5, vcc_lo, -1 +; GFX12W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12W32-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 @@ -564,8 +600,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: .LBB1_2: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX12W32-NEXT: .LBB1_2: ; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1 @@ -617,18 +653,19 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX8-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX8-NEXT: s_cbranch_execz .LBB2_4 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc -; GFX8-NEXT: .LBB2_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: .LBB2_4: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 @@ -659,18 +696,19 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX9-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX9-NEXT: s_cbranch_execz .LBB2_4 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc -; GFX9-NEXT: .LBB2_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB2_4: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 @@ -700,18 +738,19 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX10W64-NEXT: s_cbranch_execz .LBB2_4 +; GFX10W64-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX10W64-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX10W64-NEXT: s_cmov_b64 exec, vcc +; GFX10W64-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX10W64-NEXT: ; %bb.3: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc -; GFX10W64-NEXT: .LBB2_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: .LBB2_4: ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 @@ -740,18 +779,19 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 -; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX10W32-NEXT: s_cbranch_execz .LBB2_4 +; GFX10W32-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX10W32-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX10W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10W32-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX10W32-NEXT: ; %bb.3: ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_add v0, v2, s[4:7], 0 idxen glc -; GFX10W32-NEXT: .LBB2_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10W32-NEXT: .LBB2_4: ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 @@ -783,18 +823,18 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX11W64-NEXT: ; implicit-def: $vgpr0 -; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX11W64-NEXT: s_cbranch_execz .LBB2_4 +; GFX11W64-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX11W64-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX11W64-NEXT: s_cmov_b64 exec, vcc +; GFX11W64-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX11W64-NEXT: ; %bb.3: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_add_u32 v0, v2, s[8:11], 0 idxen glc -; GFX11W64-NEXT: .LBB2_4: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: .LBB2_4: ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 @@ -825,20 +865,21 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11W32-NEXT: ; implicit-def: $vgpr0 -; GFX11W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX11W32-NEXT: s_cbranch_execz .LBB2_4 +; GFX11W32-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX11W32-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX11W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11W32-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX11W32-NEXT: ; %bb.3: ; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX11W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: buffer_atomic_add_u32 v0, v2, s[4:7], 0 idxen glc -; GFX11W32-NEXT: .LBB2_4: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11W32-NEXT: .LBB2_4: ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 @@ -872,18 +913,18 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX12W64-NEXT: ; implicit-def: $vgpr0 -; GFX12W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX12W64-NEXT: s_cbranch_execz .LBB2_4 +; GFX12W64-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX12W64-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX12W64-NEXT: s_cmov_b64 exec, vcc +; GFX12W64-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX12W64-NEXT: ; %bb.3: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX12W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v0, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: .LBB2_4: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: .LBB2_4: ; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0 @@ -914,20 +955,21 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX12W32-NEXT: ; implicit-def: $vgpr0 -; GFX12W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX12W32-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX12W32-NEXT: s_cbranch_execz .LBB2_4 +; GFX12W32-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX12W32-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX12W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12W32-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX12W32-NEXT: ; %bb.3: ; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX12W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX12W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_add_u32 v0, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: .LBB2_4: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX12W32-NEXT: .LBB2_4: ; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0 @@ -1167,12 +1209,14 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-LABEL: sub_i32_constant: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec +; GFX6-NEXT: s_mov_b64 s[2:3], exec ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX6-NEXT: s_cbranch_execz .LBB5_2 +; GFX6-NEXT: s_cmov_b64 exec, vcc +; GFX6-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX6-NEXT: ; %bb.1: ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1181,8 +1225,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc -; GFX6-NEXT: .LBB5_2: ; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: .LBB5_2: ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -1200,9 +1244,11 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB5_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1211,8 +1257,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc -; GFX8-NEXT: .LBB5_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: .LBB5_2: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 @@ -1230,9 +1276,11 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1241,8 +1289,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc -; GFX9-NEXT: .LBB5_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB5_2: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 @@ -1256,12 +1304,14 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-LABEL: sub_i32_constant: ; GFX10W64: ; %bb.0: ; %entry ; GFX10W64-NEXT: s_mov_b64 s[4:5], exec -; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_cbranch_execz .LBB5_2 +; GFX10W64-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX10W64-NEXT: s_cmov_b64 exec, vcc +; GFX10W64-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1270,9 +1320,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc -; GFX10W64-NEXT: .LBB5_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: .LBB5_2: ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -1286,11 +1336,13 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W32-LABEL: sub_i32_constant: ; GFX10W32: ; %bb.0: ; %entry ; GFX10W32-NEXT: s_mov_b32 s3, exec_lo -; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: s_mov_b32 s2, exec_lo ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB5_2 +; GFX10W32-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX10W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10W32-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX10W32-NEXT: ; %bb.1: ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX10W32-NEXT: s_bcnt1_i32_b32 s3, s3 @@ -1299,9 +1351,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_sub v1, v2, s[4:7], 0 idxen glc -; GFX10W32-NEXT: .LBB5_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: .LBB5_2: ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 @@ -1320,8 +1372,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W64-NEXT: s_cbranch_execz .LBB5_2 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11W64-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX11W64-NEXT: s_cmov_b64 exec, vcc +; GFX11W64-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1331,8 +1385,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc -; GFX11W64-NEXT: .LBB5_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: .LBB5_2: ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -1353,8 +1407,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W32-NEXT: s_cbranch_execz .LBB5_2 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11W32-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX11W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11W32-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX11W32-NEXT: ; %bb.1: ; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX11W32-NEXT: s_bcnt1_i32_b32 s3, s3 @@ -1364,8 +1420,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[4:7], 0 idxen glc -; GFX11W32-NEXT: .LBB5_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: .LBB5_2: ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 @@ -1387,8 +1443,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W64-NEXT: s_cbranch_execz .LBB5_2 +; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX12W64-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX12W64-NEXT: s_cmov_b64 exec, vcc +; GFX12W64-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX12W64-NEXT: ; %bb.1: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1398,8 +1456,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: .LBB5_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: .LBB5_2: ; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 @@ -1420,8 +1478,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W32-NEXT: s_cbranch_execz .LBB5_2 +; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX12W32-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX12W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12W32-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s3, s3 @@ -1430,8 +1490,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: .LBB5_2: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX12W32-NEXT: .LBB5_2: ; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 @@ -1454,13 +1514,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX6-LABEL: sub_i32_uniform: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec +; GFX6-NEXT: s_mov_b64 s[2:3], exec ; GFX6-NEXT: s_load_dword s6, s[0:1], 0x11 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX6-NEXT: s_cbranch_execz .LBB6_2 +; GFX6-NEXT: s_cmov_b64 exec, vcc +; GFX6-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX6-NEXT: ; %bb.1: ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1469,8 +1531,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc -; GFX6-NEXT: .LBB6_2: ; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: .LBB6_2: ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -1484,14 +1546,16 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: sub_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB6_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1500,8 +1564,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc -; GFX8-NEXT: .LBB6_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: .LBB6_2: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -1515,14 +1579,16 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: sub_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1531,8 +1597,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc -; GFX9-NEXT: .LBB6_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB6_2: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -1547,12 +1613,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W64: ; %bb.0: ; %entry ; GFX10W64-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX10W64-NEXT: s_mov_b64 s[4:5], exec -; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_cbranch_execz .LBB6_2 +; GFX10W64-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX10W64-NEXT: s_cmov_b64 exec, vcc +; GFX10W64-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1561,9 +1629,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W64-NEXT: s_mul_i32 s4, s6, s4 ; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W64-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc -; GFX10W64-NEXT: .LBB6_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: .LBB6_2: ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -1578,11 +1646,13 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W32: ; %bb.0: ; %entry ; GFX10W32-NEXT: s_load_dword s2, s[0:1], 0x44 ; GFX10W32-NEXT: s_mov_b32 s4, exec_lo -; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB6_2 +; GFX10W32-NEXT: s_and_b32 s5, vcc_lo, -1 +; GFX10W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10W32-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX10W32-NEXT: ; %bb.1: ; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4 @@ -1591,9 +1661,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX10W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W32-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc -; GFX10W32-NEXT: .LBB6_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10W32-NEXT: .LBB6_2: ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: v_mul_lo_u32 v0, s2, v0 @@ -1613,8 +1683,10 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W64-NEXT: s_cbranch_execz .LBB6_2 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11W64-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX11W64-NEXT: s_cmov_b64 exec, vcc +; GFX11W64-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1624,8 +1696,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc -; GFX11W64-NEXT: .LBB6_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: .LBB6_2: ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -1647,8 +1719,10 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11W32-NEXT: s_cbranch_execz .LBB6_2 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11W32-NEXT: s_and_b32 s5, vcc_lo, -1 +; GFX11W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11W32-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX11W32-NEXT: ; %bb.1: ; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4 @@ -1658,8 +1732,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc -; GFX11W32-NEXT: .LBB6_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11W32-NEXT: .LBB6_2: ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: v_mul_lo_u32 v0, s2, v0 @@ -1682,8 +1756,10 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W64-NEXT: s_cbranch_execz .LBB6_2 +; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX12W64-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX12W64-NEXT: s_cmov_b64 exec, vcc +; GFX12W64-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX12W64-NEXT: ; %bb.1: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1693,8 +1769,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: .LBB6_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: .LBB6_2: ; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0 @@ -1716,8 +1792,10 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12W32-NEXT: s_cbranch_execz .LBB6_2 +; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX12W32-NEXT: s_and_b32 s5, vcc_lo, -1 +; GFX12W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12W32-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX12W32-NEXT: ; %bb.1: ; GFX12W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 @@ -1726,8 +1804,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: .LBB6_2: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX12W32-NEXT: .LBB6_2: ; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: v_mul_lo_u32 v0, s2, v0 @@ -1780,18 +1858,19 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX8-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX8-NEXT: s_cbranch_execz .LBB7_4 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc -; GFX8-NEXT: .LBB7_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: .LBB7_4: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 @@ -1822,18 +1901,19 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX9-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX9-NEXT: s_cbranch_execz .LBB7_4 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc -; GFX9-NEXT: .LBB7_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB7_4: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 @@ -1863,18 +1943,19 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX10W64-NEXT: s_cbranch_execz .LBB7_4 +; GFX10W64-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX10W64-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX10W64-NEXT: s_cmov_b64 exec, vcc +; GFX10W64-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX10W64-NEXT: ; %bb.3: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc -; GFX10W64-NEXT: .LBB7_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: .LBB7_4: ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 @@ -1903,18 +1984,19 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 -; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX10W32-NEXT: s_cbranch_execz .LBB7_4 +; GFX10W32-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX10W32-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX10W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10W32-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX10W32-NEXT: ; %bb.3: ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_sub v0, v2, s[4:7], 0 idxen glc -; GFX10W32-NEXT: .LBB7_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10W32-NEXT: .LBB7_4: ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 @@ -1946,18 +2028,18 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX11W64-NEXT: ; implicit-def: $vgpr0 -; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX11W64-NEXT: s_cbranch_execz .LBB7_4 +; GFX11W64-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX11W64-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX11W64-NEXT: s_cmov_b64 exec, vcc +; GFX11W64-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX11W64-NEXT: ; %bb.3: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, v2, s[8:11], 0 idxen glc -; GFX11W64-NEXT: .LBB7_4: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: .LBB7_4: ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 @@ -1988,20 +2070,21 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11W32-NEXT: ; implicit-def: $vgpr0 -; GFX11W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX11W32-NEXT: s_cbranch_execz .LBB7_4 +; GFX11W32-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX11W32-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX11W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11W32-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX11W32-NEXT: ; %bb.3: ; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX11W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, v2, s[4:7], 0 idxen glc -; GFX11W32-NEXT: .LBB7_4: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11W32-NEXT: .LBB7_4: ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 @@ -2036,18 +2119,18 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX12W64-NEXT: ; implicit-def: $vgpr0 -; GFX12W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX12W64-NEXT: s_cbranch_execz .LBB7_4 +; GFX12W64-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX12W64-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX12W64-NEXT: s_cmov_b64 exec, vcc +; GFX12W64-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX12W64-NEXT: ; %bb.3: ; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX12W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v0, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: .LBB7_4: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: .LBB7_4: ; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0 @@ -2078,20 +2161,21 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX12W32-NEXT: ; implicit-def: $vgpr0 -; GFX12W32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX12W32-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX12W32-NEXT: s_cbranch_execz .LBB7_4 +; GFX12W32-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX12W32-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX12W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12W32-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX12W32-NEXT: ; %bb.3: ; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX12W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX12W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v0, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: .LBB7_4: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX12W32-NEXT: .LBB7_4: ; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll index c9076a9541b23..6f660fab190ad 100644 --- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll +++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll @@ -21,10 +21,11 @@ define float @syncscope_system(ptr %addr, float %val) #0 { ; GFX908-NEXT: buffer_wbinvl1_vol ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB0_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -45,10 +46,11 @@ define float @syncscope_system(ptr %addr, float %val) #0 { ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -79,11 +81,12 @@ define float @syncscope_system(ptr %addr, float %val) #0 { ; GFX1100-NEXT: buffer_gl0_inv ; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX1100-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1100-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1100-NEXT: s_cbranch_execnz .LBB0_1 +; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1100-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX1100-NEXT: s_and_b32 s2, s1, -1 +; GFX1100-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1100-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX1100-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1100-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1100-NEXT: v_mov_b32_e32 v0, v3 ; GFX1100-NEXT: s_setpc_b64 s[30:31] ; @@ -108,11 +111,12 @@ define float @syncscope_system(ptr %addr, float %val) #0 { ; GFX1200-NEXT: global_inv scope:SCOPE_SYS ; GFX1200-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX1200-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1200-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1200-NEXT: s_cbranch_execnz .LBB0_1 +; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1200-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX1200-NEXT: s_and_b32 s2, s1, -1 +; GFX1200-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1200-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX1200-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1200-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1200-NEXT: v_mov_b32_e32 v0, v3 ; GFX1200-NEXT: s_setpc_b64 s[30:31] %res = atomicrmw fadd ptr %addr, float %val seq_cst @@ -134,10 +138,11 @@ define float @syncscope_workgroup_rtn(ptr %addr, float %val) #0 { ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB1_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -146,24 +151,29 @@ define float @syncscope_workgroup_rtn(ptr %addr, float %val) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX90A-NEXT: ; implicit-def: $vgpr3 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB1_6 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB1_6 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 +; GFX90A-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX90A-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX90A-NEXT: ; implicit-def: $vgpr3 -; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB1_3 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB1_3 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX90A-NEXT: global_atomic_add_f32 v3, v[0:1], v2, off glc ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2 +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: .LBB1_3: ; %Flow -; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB1_5 +; GFX90A-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; GFX90A-NEXT: s_and_b64 s[10:11], s[6:7], -1 +; GFX90A-NEXT: s_cmov_b64 exec, s[6:7] +; GFX90A-NEXT: s_cbranch_scc0 .LBB1_5 ; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc @@ -171,21 +181,24 @@ define float @syncscope_workgroup_rtn(ptr %addr, float %val) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v1, v3, v2 ; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: .LBB1_5: ; %Flow1 -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: .LBB1_6: ; %Flow2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB1_8 +; GFX90A-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX90A-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GFX90A-NEXT: s_cmov_b64 exec, s[4:5] +; GFX90A-NEXT: s_cbranch_scc0 .LBB1_8 ; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: ds_add_rtn_f32 v3, v0, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: .LBB1_8: ; %atomicrmw.phi -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: .LBB1_8: ; %atomicrmw.end ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -228,48 +241,51 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) #0 { ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB2_3 -; GFX908-NEXT: ; %bb.1: ; %Flow2 -; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB2_8 -; GFX908-NEXT: .LBB2_2: ; %atomicrmw.phi -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: s_setpc_b64 s[30:31] -; GFX908-NEXT: .LBB2_3: ; %atomicrmw.check.private +; GFX908-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX908-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX908-NEXT: s_cmov_b64 exec, vcc +; GFX908-NEXT: s_cbranch_scc0 .LBB2_6 +; GFX908-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX908-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 -; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX908-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX908-NEXT: s_cbranch_execz .LBB2_5 -; GFX908-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX908-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX908-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX908-NEXT: s_cmov_b64 exec, vcc +; GFX908-NEXT: s_cbranch_scc0 .LBB2_3 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: ; implicit-def: $vgpr2 -; GFX908-NEXT: .LBB2_5: ; %Flow -; GFX908-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; GFX908-NEXT: s_cbranch_execz .LBB2_7 -; GFX908-NEXT: ; %bb.6: ; %atomicrmw.private +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: .LBB2_3: ; %Flow +; GFX908-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; GFX908-NEXT: s_and_b64 s[10:11], s[6:7], -1 +; GFX908-NEXT: s_cmov_b64 exec, s[6:7] +; GFX908-NEXT: s_cbranch_scc0 .LBB2_5 +; GFX908-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX908-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX908-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX908-NEXT: .LBB2_7: ; %Flow1 -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: .LBB2_5: ; %Flow1 ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: ; implicit-def: $vgpr2 -; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_cbranch_execz .LBB2_2 -; GFX908-NEXT: .LBB2_8: ; %atomicrmw.shared +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: .LBB2_6: ; %Flow2 +; GFX908-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX908-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GFX908-NEXT: s_cmov_b64 exec, s[4:5] +; GFX908-NEXT: s_cbranch_scc0 .LBB2_8 +; GFX908-NEXT: ; %bb.7: ; %atomicrmw.shared ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX908-NEXT: ds_add_f32 v0, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: .LBB2_8: ; %atomicrmw.phi ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -278,48 +294,51 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB2_3 -; GFX90A-NEXT: ; %bb.1: ; %Flow2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB2_8 -; GFX90A-NEXT: .LBB2_2: ; %atomicrmw.phi -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; GFX90A-NEXT: .LBB2_3: ; %atomicrmw.check.private +; GFX90A-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX90A-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB2_6 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 -; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB2_5 -; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX90A-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX90A-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB2_3 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: .LBB2_5: ; %Flow -; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] -; GFX90A-NEXT: s_cbranch_execz .LBB2_7 -; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: .LBB2_3: ; %Flow +; GFX90A-NEXT: s_xor_b64 s[8:9], s[6:7], exec +; GFX90A-NEXT: s_and_b64 s[10:11], s[6:7], -1 +; GFX90A-NEXT: s_cmov_b64 exec, s[6:7] +; GFX90A-NEXT: s_cbranch_scc0 .LBB2_5 +; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX90A-NEXT: .LBB2_7: ; %Flow1 -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: .LBB2_5: ; %Flow1 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB2_2 -; GFX90A-NEXT: .LBB2_8: ; %atomicrmw.shared +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: .LBB2_6: ; %Flow2 +; GFX90A-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX90A-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GFX90A-NEXT: s_cmov_b64 exec, s[4:5] +; GFX90A-NEXT: s_cbranch_scc0 .LBB2_8 +; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX90A-NEXT: ds_add_f32 v0, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: .LBB2_8: ; %atomicrmw.phi ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -371,10 +390,11 @@ define float @no_unsafe(ptr %addr, float %val) { ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB3_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -392,10 +412,11 @@ define float @no_unsafe(ptr %addr, float %val) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -423,11 +444,12 @@ define float @no_unsafe(ptr %addr, float %val) { ; GFX1100-NEXT: buffer_gl0_inv ; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX1100-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1100-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1100-NEXT: s_cbranch_execnz .LBB3_1 +; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1100-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX1100-NEXT: s_and_b32 s2, s1, -1 +; GFX1100-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1100-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1100-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1100-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1100-NEXT: v_mov_b32_e32 v0, v3 ; GFX1100-NEXT: s_setpc_b64 s[30:31] ; @@ -452,11 +474,12 @@ define float @no_unsafe(ptr %addr, float %val) { ; GFX1200-NEXT: global_inv scope:SCOPE_SE ; GFX1200-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX1200-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1200-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1200-NEXT: s_cbranch_execnz .LBB3_1 +; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1200-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX1200-NEXT: s_and_b32 s2, s1, -1 +; GFX1200-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1200-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1200-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1200-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1200-NEXT: v_mov_b32_e32 v0, v3 ; GFX1200-NEXT: s_setpc_b64 s[30:31] %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll index f9a43dd61c8cf..584800dd7bca8 100644 --- a/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll +++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll @@ -17,10 +17,11 @@ define i32 @atomic_nand_i32_lds(ptr addrspace(3) %ptr) nounwind { ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB0_1 +; GCN-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN-NEXT: s_cbranch_scc1 .LBB0_1 ; GCN-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw nand ptr addrspace(3) %ptr, i32 4 seq_cst @@ -44,10 +45,11 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { ; GCN-NEXT: buffer_wbinvl1_vol ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB1_1 +; GCN-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN-NEXT: s_cbranch_scc1 .LBB1_1 ; GCN-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw nand ptr addrspace(1) %ptr, i32 4 seq_cst @@ -71,10 +73,11 @@ define i32 @atomic_nand_i32_flat(ptr %ptr) nounwind { ; GCN-NEXT: buffer_wbinvl1_vol ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB2_1 +; GCN-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN-NEXT: s_cbranch_scc1 .LBB2_1 ; GCN-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw nand ptr %ptr, i32 4 seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/atomics-cas-remarks-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/atomics-cas-remarks-gfx90a.ll index bc9008c6f1745..b24c1fed19209 100644 --- a/llvm/test/CodeGen/AMDGPU/atomics-cas-remarks-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/atomics-cas-remarks-gfx90a.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs --pass-remarks=atomic-expand \ ; RUN: %s -o - 2>&1 | FileCheck %s --check-prefix=GFX90A-CAS diff --git a/llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll b/llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll index 3ed2cb856eaea..da1a3b3786f07 100644 --- a/llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll @@ -18,15 +18,14 @@ define i32 @prolog_spill(i32 %arg0, i32 %arg1, i32 %arg2) { ; REGALLOC-NEXT: renamable $sgpr6 = IMPLICIT_DEF ; REGALLOC-NEXT: renamable $vgpr1 = COPY killed renamable $sgpr6 ; REGALLOC-NEXT: SI_SPILL_V32_SAVE killed $vgpr1, %stack.3, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) - ; REGALLOC-NEXT: renamable $sgpr6_sgpr7 = COPY $exec, implicit-def $exec - ; REGALLOC-NEXT: renamable $sgpr4_sgpr5 = S_AND_B64 renamable $sgpr6_sgpr7, killed renamable $sgpr4_sgpr5, implicit-def dead $scc - ; REGALLOC-NEXT: renamable $sgpr6_sgpr7 = S_XOR_B64 renamable $sgpr4_sgpr5, killed renamable $sgpr6_sgpr7, implicit-def dead $scc + ; REGALLOC-NEXT: renamable $sgpr6_sgpr7 = S_XOR_B64 renamable $sgpr4_sgpr5, $exec, implicit-def dead $scc ; REGALLOC-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr6, 0, $vgpr0, implicit-def $sgpr6_sgpr7, implicit $sgpr6_sgpr7 ; REGALLOC-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr7, 1, $vgpr0, implicit killed $sgpr6_sgpr7 ; REGALLOC-NEXT: SI_SPILL_WWM_V32_SAVE killed $vgpr0, %stack.2, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) - ; REGALLOC-NEXT: $exec = S_MOV_B64_term killed renamable $sgpr4_sgpr5 - ; REGALLOC-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec - ; REGALLOC-NEXT: S_BRANCH %bb.3 + ; REGALLOC-NEXT: dead renamable $sgpr6_sgpr7 = S_AND_B64 renamable $sgpr4_sgpr5, -1, implicit-def $scc + ; REGALLOC-NEXT: $exec = S_CMOV_B64_term killed renamable $sgpr4_sgpr5, implicit $scc + ; REGALLOC-NEXT: S_CBRANCH_SCC1 %bb.3, implicit killed $scc + ; REGALLOC-NEXT: S_BRANCH %bb.1 ; REGALLOC-NEXT: {{ $}} ; REGALLOC-NEXT: bb.1.Flow: ; REGALLOC-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) @@ -34,40 +33,45 @@ define i32 @prolog_spill(i32 %arg0, i32 %arg1, i32 %arg2) { ; REGALLOC-NEXT: $vgpr0 = SI_SPILL_WWM_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) ; REGALLOC-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr4_sgpr5 ; REGALLOC-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1 - ; REGALLOC-NEXT: renamable $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 killed renamable $sgpr4_sgpr5, implicit-def $exec, implicit-def dead $scc, implicit $exec ; REGALLOC-NEXT: $vgpr1 = SI_SPILL_V32_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) ; REGALLOC-NEXT: SI_SPILL_V32_SAVE killed $vgpr1, %stack.6, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5) - ; REGALLOC-NEXT: renamable $sgpr4_sgpr5 = S_AND_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc - ; REGALLOC-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr4, 2, $vgpr0, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5 - ; REGALLOC-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr5, 3, $vgpr0, implicit $sgpr4_sgpr5 + ; REGALLOC-NEXT: renamable $sgpr6_sgpr7 = S_XOR_B64 renamable $sgpr4_sgpr5, $exec, implicit-def dead $scc + ; REGALLOC-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr6, 2, $vgpr0, implicit-def $sgpr6_sgpr7, implicit $sgpr6_sgpr7 + ; REGALLOC-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr7, 3, $vgpr0, implicit killed $sgpr6_sgpr7 ; REGALLOC-NEXT: SI_SPILL_WWM_V32_SAVE killed $vgpr0, %stack.2, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) - ; REGALLOC-NEXT: $exec = S_XOR_B64_term $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc - ; REGALLOC-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec - ; REGALLOC-NEXT: S_BRANCH %bb.2 + ; REGALLOC-NEXT: dead renamable $sgpr6_sgpr7 = S_AND_B64 renamable $sgpr4_sgpr5, -1, implicit-def $scc + ; REGALLOC-NEXT: $exec = S_CMOV_B64_term killed renamable $sgpr4_sgpr5, implicit $scc + ; REGALLOC-NEXT: S_CBRANCH_SCC1 %bb.2, implicit killed $scc + ; REGALLOC-NEXT: S_BRANCH %bb.4 ; REGALLOC-NEXT: {{ $}} ; REGALLOC-NEXT: bb.2.bb.1: ; REGALLOC-NEXT: successors: %bb.4(0x80000000) ; REGALLOC-NEXT: {{ $}} + ; REGALLOC-NEXT: $vgpr1 = SI_SPILL_WWM_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) + ; REGALLOC-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 2, implicit-def $sgpr4_sgpr5 + ; REGALLOC-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 3 ; REGALLOC-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5) - ; REGALLOC-NEXT: renamable $sgpr4 = S_MOV_B32 10 - ; REGALLOC-NEXT: renamable $vgpr0 = V_ADD_U32_e64 $vgpr0, killed $sgpr4, 0, implicit $exec + ; REGALLOC-NEXT: renamable $sgpr6 = S_MOV_B32 10 + ; REGALLOC-NEXT: renamable $vgpr0 = V_ADD_U32_e64 $vgpr0, killed $sgpr6, 0, implicit $exec ; REGALLOC-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.6, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5) + ; REGALLOC-NEXT: $exec = S_OR_B64_term $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc ; REGALLOC-NEXT: S_BRANCH %bb.4 ; REGALLOC-NEXT: {{ $}} ; REGALLOC-NEXT: bb.3.bb.2: ; REGALLOC-NEXT: successors: %bb.1(0x80000000) ; REGALLOC-NEXT: {{ $}} + ; REGALLOC-NEXT: $vgpr1 = SI_SPILL_WWM_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) + ; REGALLOC-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0, implicit-def $sgpr4_sgpr5 + ; REGALLOC-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1 ; REGALLOC-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5) - ; REGALLOC-NEXT: renamable $sgpr4 = S_MOV_B32 20 - ; REGALLOC-NEXT: renamable $vgpr0 = V_ADD_U32_e64 $vgpr0, killed $sgpr4, 0, implicit $exec + ; REGALLOC-NEXT: renamable $sgpr6 = S_MOV_B32 20 + ; REGALLOC-NEXT: renamable $vgpr0 = V_ADD_U32_e64 $vgpr0, killed $sgpr6, 0, implicit $exec ; REGALLOC-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.3, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) + ; REGALLOC-NEXT: $exec = S_OR_B64_term $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc ; REGALLOC-NEXT: S_BRANCH %bb.1 ; REGALLOC-NEXT: {{ $}} ; REGALLOC-NEXT: bb.4.bb.3: ; REGALLOC-NEXT: $vgpr1 = SI_SPILL_WWM_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) - ; REGALLOC-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 2, implicit-def $sgpr4_sgpr5 - ; REGALLOC-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 3 - ; REGALLOC-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc ; REGALLOC-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5) ; REGALLOC-NEXT: renamable $vgpr0 = V_LSHL_ADD_U32_e64 killed $vgpr0, 2, $vgpr0, implicit $exec ; REGALLOC-NEXT: KILL killed renamable $vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/block-should-not-be-in-alive-blocks.mir b/llvm/test/CodeGen/AMDGPU/block-should-not-be-in-alive-blocks.mir index 6483ff28c0de0..ef5d5df26695d 100644 --- a/llvm/test/CodeGen/AMDGPU/block-should-not-be-in-alive-blocks.mir +++ b/llvm/test/CodeGen/AMDGPU/block-should-not-be-in-alive-blocks.mir @@ -22,12 +22,11 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 ; CHECK-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[COPY1]], implicit $exec ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed [[COPY1]] - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $exec_lo, implicit-def $exec_lo - ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY3]], killed [[V_CMP_NE_U32_e64_]], implicit-def dead $scc - ; CHECK-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[S_AND_B32_]], [[COPY3]], implicit-def dead $scc - ; CHECK-NEXT: $exec_lo = S_MOV_B32_term killed [[S_AND_B32_]] - ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.5, implicit $exec - ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_NE_U32_e64_]], $exec_lo, implicit-def $scc + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_CMP_NE_U32_e64_]], 4294967295, implicit-def $scc + ; CHECK-NEXT: $exec_lo = S_CMOV_B32_term [[V_CMP_NE_U32_e64_]], implicit $scc + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; CHECK-NEXT: S_BRANCH %bb.5 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.7(0x80000000) @@ -39,6 +38,7 @@ body: | ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec ; CHECK-NEXT: GLOBAL_STORE_BYTE killed [[V_MOV_B]], killed [[GLOBAL_LOAD_UBYTE]], 0, 0, implicit $exec :: (store (s8), addrspace 1) + ; CHECK-NEXT: $exec_lo = S_OR_B32_term $exec_lo, %12, implicit-def $scc ; CHECK-NEXT: S_BRANCH %bb.7 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: @@ -61,12 +61,12 @@ body: | ; CHECK-NEXT: bb.5: ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.7(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[S_OR_SAVEEXEC_B32_:%[0-9]+]]:sreg_32 = S_OR_SAVEEXEC_B32 killed [[S_XOR_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]] - ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 $exec_lo, [[S_OR_SAVEEXEC_B32_]], implicit-def $scc - ; CHECK-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_B32_1]], implicit-def $scc - ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.7, implicit $exec - ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]] + ; CHECK-NEXT: [[S_XOR_B32_1:%[0-9]+]]:sreg_32 = S_XOR_B32 [[S_XOR_B32_]], $exec_lo, implicit-def $scc + ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_XOR_B32_]], 4294967295, implicit-def $scc + ; CHECK-NEXT: $exec_lo = S_CMOV_B32_term [[S_XOR_B32_]], implicit $scc + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc + ; CHECK-NEXT: S_BRANCH %bb.7 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.6: ; CHECK-NEXT: successors: %bb.5(0x80000000) @@ -75,7 +75,6 @@ body: | ; CHECK-NEXT: S_BRANCH %bb.5 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.7: - ; CHECK-NEXT: $exec_lo = S_OR_B32 $exec_lo, killed [[S_AND_B32_1]], implicit-def $scc ; CHECK-NEXT: S_ENDPGM 0 bb.0: successors: %bb.2(0x40000000), %bb.5(0x40000000) @@ -97,6 +96,7 @@ body: | %10:vgpr_32 = GLOBAL_LOAD_UBYTE killed %9, 0, 0, implicit $exec :: (load (s8), addrspace 1) %11:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec GLOBAL_STORE_BYTE killed %11, killed %10, 0, 0, implicit $exec :: (store (s8), addrspace 1) + SI_WAVE_RECONVERGE %14, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.7 bb.2: @@ -128,7 +128,6 @@ body: | S_BRANCH %bb.5 bb.7: - SI_END_CF %14, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_ENDPGM 0 ... diff --git a/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll b/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll index cc05129b1b2af..78c44649fa2d8 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; RUN: llc -mtriple=amdgcn -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=GCN %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=GCN %s diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll index 384715a849c1e..86e00a2df2ae4 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll @@ -18,7 +18,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM renamable $sgpr6_sgpr7, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg6.kernarg.offset.align.down, align 8, addrspace 4) ; GFX90A-NEXT: renamable $sgpr15 = S_LOAD_DWORD_IMM renamable $sgpr6_sgpr7, 40, 0 :: (dereferenceable invariant load (s32) from %ir.arg6.kernarg.offset.align.down + 16, align 8, addrspace 4) ; GFX90A-NEXT: renamable $sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX4_IMM renamable $sgpr6_sgpr7, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_LOAD_DWORDX2_IMM renamable $sgpr6_sgpr7, 16, 0 :: (dereferenceable invariant load (s64) from %ir.arg.kernarg.offset1 + 16, align 16, addrspace 4) + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_LOAD_DWORDX2_IMM renamable $sgpr6_sgpr7, 16, 0 :: (dereferenceable invariant load (s64) from %ir.arg.kernarg.offset1 + 16, align 16, addrspace 4) ; GFX90A-NEXT: S_BITCMP1_B32 renamable $sgpr33, 0, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr8_sgpr9 = S_CSELECT_B64 -1, 0, implicit killed $scc ; GFX90A-NEXT: renamable $sgpr30_sgpr31 = S_MOV_B64 -1 @@ -28,13 +28,13 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr26_sgpr27 = S_XOR_B64 killed renamable $sgpr26_sgpr27, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $vgpr3 = V_MOV_B32_e32 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = DS_READ_B32_gfx9 renamable $vgpr3, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) null`, align 8, addrspace 3) - ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr24_sgpr25, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCZ %bb.2, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.1.bb103: - ; GFX90A-NEXT: successors: %bb.59(0x40000000), %bb.2(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr42_sgpr43, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000FF, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: successors: %bb.57(0x40000000), %bb.2(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr40_sgpr41, $sgpr48_sgpr49:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000FF, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr30_sgpr31 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr26_sgpr27, implicit-def dead $scc @@ -43,11 +43,11 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: $vgpr24 = IMPLICIT_DEF ; GFX90A-NEXT: $vgpr18 = IMPLICIT_DEF ; GFX90A-NEXT: $vgpr20 = IMPLICIT_DEF - ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.59, implicit $vcc + ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.57, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2: ; GFX90A-NEXT: successors: %bb.3(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr22, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6, $sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr30_sgpr31, $sgpr42_sgpr43, $sgpr54, $sgpr55, $sgpr16_sgpr17_sgpr18, $sgpr18_sgpr19, $sgpr20_sgpr21_sgpr22, $vgpr2, $vgpr3, $vgpr10, $vgpr24, $vgpr18, $vgpr20 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr22, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6, $sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr30_sgpr31, $sgpr40_sgpr41, $sgpr48, $sgpr49, $sgpr16_sgpr17_sgpr18, $sgpr18_sgpr19, $sgpr20_sgpr21_sgpr22, $vgpr2, $vgpr3, $vgpr10, $vgpr24, $vgpr18, $vgpr20 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr23 = IMPLICIT_DEF @@ -59,41 +59,41 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr28_sgpr29 = S_MOV_B64 0 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.3.Flow17: - ; GFX90A-NEXT: successors: %bb.4(0x40000000), %bb.58(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr23, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr42_sgpr43, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000FF, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: successors: %bb.4(0x40000000), %bb.56(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr23, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr40_sgpr41, $sgpr48_sgpr49:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000FF, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr30 = V_AND_B32_e32 1023, $vgpr31, implicit $exec ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr30_sgpr31, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_VCCZ %bb.58, implicit $vcc + ; GFX90A-NEXT: S_CBRANCH_VCCZ %bb.56, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.4.bb15: ; GFX90A-NEXT: successors: %bb.35(0x40000000), %bb.5(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000FF, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr42_sgpr43 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr48_sgpr49:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000FF, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = V_LSHLREV_B64_e64 2, $vgpr2_vgpr3, implicit $exec ; GFX90A-NEXT: renamable $vgpr4 = COPY renamable $sgpr17, implicit $exec - ; GFX90A-NEXT: renamable $vgpr46, renamable $vcc = V_ADD_CO_U32_e64 $sgpr16, $vgpr0, 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr47, dead renamable $vcc = V_ADDC_U32_e64 killed $vgpr4, killed $vgpr1, killed $vcc, 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr44, renamable $vcc = V_ADD_CO_U32_e64 $sgpr16, $vgpr0, 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr45, dead renamable $vcc = V_ADDC_U32_e64 killed $vgpr4, killed $vgpr1, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr0 = nuw nsw V_LSHLREV_B32_e32 2, $vgpr30, implicit $exec - ; GFX90A-NEXT: renamable $vgpr40, renamable $vcc = V_ADD_CO_U32_e64 $vgpr46, killed $vgpr0, 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr41, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr47, killed $vcc, 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr40, renamable $vcc = V_ADD_CO_U32_e64 $vgpr44, killed $vgpr0, 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr41, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr45, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr26_sgpr27, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.35, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.5: ; GFX90A-NEXT: successors: %bb.6(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr42_sgpr43 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF @@ -103,7 +103,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr60_vgpr61 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr58_vgpr59 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr56_vgpr57 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr44_vgpr45 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr46_vgpr47 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr42_vgpr43 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF @@ -117,7 +117,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.6.Flow20: ; GFX90A-NEXT: successors: %bb.7(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr19 = COPY renamable $sgpr15, implicit $exec ; GFX90A-NEXT: renamable $vgpr18 = COPY $sgpr15, implicit $exec @@ -129,217 +129,242 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr24 = COPY $sgpr15, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.7.Flow19: - ; GFX90A-NEXT: successors: %bb.63(0x40000000), %bb.8(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: successors: %bb.61(0x40000000), %bb.8(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 0 - ; GFX90A-NEXT: $sgpr30_sgpr31 = S_AND_SAVEEXEC_B64 $sgpr28_sgpr29, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.63, implicit $exec + ; GFX90A-NEXT: renamable $sgpr30_sgpr31 = COPY $exec + ; GFX90A-NEXT: renamable $sgpr28_sgpr29 = S_AND_B64 killed renamable $sgpr28_sgpr29, $exec, implicit-def $scc + ; GFX90A-NEXT: dead renamable $sgpr48_sgpr49 = S_AND_B64 renamable $sgpr28_sgpr29, -1, implicit-def $scc + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0 + ; GFX90A-NEXT: $exec = S_CMOV_B64_term killed renamable $sgpr28_sgpr29, implicit $scc + ; GFX90A-NEXT: S_CBRANCH_SCC1 %bb.61, implicit $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.8.Flow32: ; GFX90A-NEXT: successors: %bb.9(0x40000000), %bb.10(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr30_sgpr31, implicit-def $scc - ; GFX90A-NEXT: $sgpr8_sgpr9 = S_AND_SAVEEXEC_B64 $sgpr42_sgpr43, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: renamable $sgpr8_sgpr9 = S_XOR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.10, implicit $exec + ; GFX90A-NEXT: renamable $sgpr8_sgpr9 = COPY $exec + ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr40_sgpr41, $exec, implicit-def $scc + ; GFX90A-NEXT: dead renamable $sgpr20_sgpr21 = S_AND_B64 renamable $sgpr18_sgpr19, -1, implicit-def $scc + ; GFX90A-NEXT: $exec = S_CMOV_B64_term killed renamable $sgpr18_sgpr19, implicit $scc + ; GFX90A-NEXT: S_CBRANCH_SCC0 %bb.10, implicit $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.9.bb89: ; GFX90A-NEXT: successors: %bb.10(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_OR_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.10.Flow33: ; GFX90A-NEXT: successors: %bb.11(0x40000000), %bb.12(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def $scc - ; GFX90A-NEXT: $sgpr8_sgpr9 = S_AND_SAVEEXEC_B64 $sgpr56_sgpr57, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: renamable $sgpr8_sgpr9 = S_XOR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.12, implicit $exec + ; GFX90A-NEXT: renamable $sgpr8_sgpr9 = COPY $exec + ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def $scc + ; GFX90A-NEXT: dead renamable $sgpr20_sgpr21 = S_AND_B64 renamable $sgpr18_sgpr19, -1, implicit-def $scc + ; GFX90A-NEXT: $exec = S_CMOV_B64_term killed renamable $sgpr18_sgpr19, implicit $scc + ; GFX90A-NEXT: S_CBRANCH_SCC0 %bb.12, implicit $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.11.bb84: ; GFX90A-NEXT: successors: %bb.12(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_OR_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.12.Flow34: ; GFX90A-NEXT: successors: %bb.13(0x40000000), %bb.14(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def $scc - ; GFX90A-NEXT: $sgpr8_sgpr9 = S_AND_SAVEEXEC_B64 $sgpr52_sgpr53, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: renamable $sgpr8_sgpr9 = S_XOR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.14, implicit $exec + ; GFX90A-NEXT: renamable $sgpr8_sgpr9 = COPY $exec + ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def $scc + ; GFX90A-NEXT: dead renamable $sgpr20_sgpr21 = S_AND_B64 renamable $sgpr18_sgpr19, -1, implicit-def $scc + ; GFX90A-NEXT: $exec = S_CMOV_B64_term killed renamable $sgpr18_sgpr19, implicit $scc + ; GFX90A-NEXT: S_CBRANCH_SCC0 %bb.14, implicit $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.13.bb79: ; GFX90A-NEXT: successors: %bb.14(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_OR_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.14.Flow35: ; GFX90A-NEXT: successors: %bb.15(0x40000000), %bb.16(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def $scc - ; GFX90A-NEXT: $sgpr8_sgpr9 = S_AND_SAVEEXEC_B64 $sgpr16_sgpr17, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_XOR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.16, implicit $exec + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = COPY $exec + ; GFX90A-NEXT: renamable $sgpr8_sgpr9 = S_AND_B64 killed renamable $sgpr16_sgpr17, $exec, implicit-def $scc + ; GFX90A-NEXT: dead renamable $sgpr16_sgpr17 = S_AND_B64 renamable $sgpr8_sgpr9, -1, implicit-def $scc + ; GFX90A-NEXT: $exec = S_CMOV_B64_term killed renamable $sgpr8_sgpr9, implicit $scc + ; GFX90A-NEXT: S_CBRANCH_SCC0 %bb.16, implicit $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.15.bb72: ; GFX90A-NEXT: successors: %bb.16(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr8 = S_ADD_U32 renamable $sgpr6, 48, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr9 = S_ADDC_U32 killed renamable $sgpr7, 0, implicit-def dead $scc, implicit killed $scc ; GFX90A-NEXT: renamable $sgpr6_sgpr7 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @f2, target-flags(amdgpu-gotprel32-hi) @f2, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr6_sgpr7 = S_LOAD_DWORDX2_IMM killed renamable $sgpr6_sgpr7, 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) ; GFX90A-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr6_sgpr7, @f2, csr_amdgpu_gfx90ainsts, implicit $sgpr4_sgpr5, implicit undef $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit undef $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1 - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_OR_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr40_sgpr41, implicit-def $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.16.Flow36: ; GFX90A-NEXT: successors: %bb.17(0x40000000), %bb.18(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr42_sgpr43, implicit-def $scc - ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr50_sgpr51, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.18, implicit $exec + ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = COPY $exec + ; GFX90A-NEXT: renamable $sgpr6_sgpr7 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def $scc + ; GFX90A-NEXT: dead renamable $sgpr8_sgpr9 = S_AND_B64 renamable $sgpr6_sgpr7, -1, implicit-def $scc + ; GFX90A-NEXT: $exec = S_CMOV_B64_term killed renamable $sgpr6_sgpr7, implicit $scc + ; GFX90A-NEXT: S_CBRANCH_SCC0 %bb.18, implicit $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.17.bb67: ; GFX90A-NEXT: successors: %bb.18(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr47, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr46, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr45, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr44, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_OR_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.18.Flow37: ; GFX90A-NEXT: successors: %bb.19(0x40000000), %bb.20(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc - ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr48_sgpr49, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.20, implicit $exec + ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = COPY $exec + ; GFX90A-NEXT: renamable $sgpr6_sgpr7 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def $scc + ; GFX90A-NEXT: dead renamable $sgpr8_sgpr9 = S_AND_B64 renamable $sgpr6_sgpr7, -1, implicit-def $scc + ; GFX90A-NEXT: $exec = S_CMOV_B64_term killed renamable $sgpr6_sgpr7, implicit $scc + ; GFX90A-NEXT: S_CBRANCH_SCC0 %bb.20, implicit $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.19.bb62: ; GFX90A-NEXT: successors: %bb.20(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr63, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr62, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_OR_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.20.Flow38: ; GFX90A-NEXT: successors: %bb.21(0x40000000), %bb.22(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc - ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr46_sgpr47, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.22, implicit $exec + ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = COPY $exec + ; GFX90A-NEXT: renamable $sgpr6_sgpr7 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def $scc + ; GFX90A-NEXT: dead renamable $sgpr8_sgpr9 = S_AND_B64 renamable $sgpr6_sgpr7, -1, implicit-def $scc + ; GFX90A-NEXT: $exec = S_CMOV_B64_term killed renamable $sgpr6_sgpr7, implicit $scc + ; GFX90A-NEXT: S_CBRANCH_SCC0 %bb.22, implicit $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.21.bb54: ; GFX90A-NEXT: successors: %bb.22(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr61, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr60, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_OR_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.22.Flow39: ; GFX90A-NEXT: successors: %bb.23(0x40000000), %bb.24(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc - ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr44_sgpr45, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.24, implicit $exec + ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = COPY $exec + ; GFX90A-NEXT: renamable $sgpr6_sgpr7 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def $scc + ; GFX90A-NEXT: dead renamable $sgpr8_sgpr9 = S_AND_B64 renamable $sgpr6_sgpr7, -1, implicit-def $scc + ; GFX90A-NEXT: $exec = S_CMOV_B64_term killed renamable $sgpr6_sgpr7, implicit $scc + ; GFX90A-NEXT: S_CBRANCH_SCC0 %bb.24, implicit $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.23.bb47: ; GFX90A-NEXT: successors: %bb.24(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr48_sgpr49, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr59, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr58, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_OR_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.24.Flow40: ; GFX90A-NEXT: successors: %bb.25(0x40000000), %bb.26(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr48_sgpr49, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc - ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr40_sgpr41, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.26, implicit $exec + ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = COPY $exec + ; GFX90A-NEXT: renamable $sgpr6_sgpr7 = S_AND_B64 killed renamable $sgpr38_sgpr39, $exec, implicit-def $scc + ; GFX90A-NEXT: dead renamable $sgpr8_sgpr9 = S_AND_B64 renamable $sgpr6_sgpr7, -1, implicit-def $scc + ; GFX90A-NEXT: $exec = S_CMOV_B64_term killed renamable $sgpr6_sgpr7, implicit $scc + ; GFX90A-NEXT: S_CBRANCH_SCC0 %bb.26, implicit $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.25.bb40: ; GFX90A-NEXT: successors: %bb.26(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr42_sgpr43, $sgpr48_sgpr49, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr57, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr56, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_OR_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.26.Flow41: ; GFX90A-NEXT: successors: %bb.27(0x40000000), %bb.28(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr42_sgpr43, $sgpr48_sgpr49, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc - ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr38_sgpr39, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.28, implicit $exec + ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = COPY $exec + ; GFX90A-NEXT: renamable $sgpr6_sgpr7 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def $scc + ; GFX90A-NEXT: dead renamable $sgpr8_sgpr9 = S_AND_B64 renamable $sgpr6_sgpr7, -1, implicit-def $scc + ; GFX90A-NEXT: $exec = S_CMOV_B64_term killed renamable $sgpr6_sgpr7, implicit $scc + ; GFX90A-NEXT: S_CBRANCH_SCC0 %bb.28, implicit $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.27.bb33: ; GFX90A-NEXT: successors: %bb.28(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr48_sgpr49, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr45, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr44, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr47, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr46, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_OR_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.28.Flow42: ; GFX90A-NEXT: successors: %bb.34(0x40000000), %bb.29(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr48_sgpr49, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc - ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr36_sgpr37, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.34, implicit $exec + ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = COPY $exec + ; GFX90A-NEXT: renamable $sgpr6_sgpr7 = S_AND_B64 killed renamable $sgpr36_sgpr37, $exec, implicit-def $scc + ; GFX90A-NEXT: dead renamable $sgpr8_sgpr9 = S_AND_B64 renamable $sgpr6_sgpr7, -1, implicit-def $scc + ; GFX90A-NEXT: $exec = S_CMOV_B64_term killed renamable $sgpr6_sgpr7, implicit $scc + ; GFX90A-NEXT: S_CBRANCH_SCC1 %bb.34, implicit $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.29.Flow43: ; GFX90A-NEXT: successors: %bb.30(0x40000000), %bb.31(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr34_sgpr35, $sgpr48_sgpr49, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc ; GFX90A-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr34_sgpr35, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.31, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.30.bb19: ; GFX90A-NEXT: successors: %bb.31(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr48_sgpr49, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr41, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_OR_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.31.Flow44: ; GFX90A-NEXT: successors: %bb.32(0x40000000), %bb.33(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr54_sgpr55, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr48_sgpr49, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr54_sgpr55, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.33, implicit $exec + ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def $scc + ; GFX90A-NEXT: dead renamable $sgpr6_sgpr7 = S_AND_B64 renamable $sgpr4_sgpr5, -1, implicit-def $scc + ; GFX90A-NEXT: $exec = S_CMOV_B64_term killed renamable $sgpr4_sgpr5, implicit $scc + ; GFX90A-NEXT: S_CBRANCH_SCC0 %bb.33, implicit $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.32.UnifiedUnreachableBlock: ; GFX90A-NEXT: successors: %bb.33(0x80000000) @@ -354,32 +379,35 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.34.bb26: ; GFX90A-NEXT: successors: %bb.29(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr48_sgpr49, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr43, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr42, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_OR_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc ; GFX90A-NEXT: S_BRANCH %bb.29 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.35.bb20: - ; GFX90A-NEXT: successors: %bb.37(0x40000000), %bb.36(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr42_sgpr43 + ; GFX90A-NEXT: successors: %bb.36(0x40000000), %bb.6(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr48_sgpr49:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41 ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: renamable $sgpr30_sgpr31 = COPY $exec ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_SBYTE renamable $vgpr40_vgpr41, 1024, 0, implicit $exec :: (load (s8) from %ir.i21, addrspace 1) ; GFX90A-NEXT: renamable $vgpr42 = V_ADD_CO_U32_e32 1024, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr43, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_LT_I16_e64 0, killed $vgpr0, implicit $exec ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_MOV_B64 0 + ; GFX90A-NEXT: dead renamable $sgpr42_sgpr43 = S_AND_B64 renamable $vcc, -1, implicit-def $scc + ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF @@ -388,7 +416,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr60_vgpr61 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr58_vgpr59 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr56_vgpr57 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr44_vgpr45 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr46_vgpr47 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF @@ -398,28 +426,24 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: $sgpr30_sgpr31 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.37, implicit $exec - ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.36.Flow21: - ; GFX90A-NEXT: successors: %bb.6(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr30_sgpr31, implicit-def $scc - ; GFX90A-NEXT: S_BRANCH %bb.6 + ; GFX90A-NEXT: $exec = S_CMOV_B64_term killed renamable $vcc, implicit $scc + ; GFX90A-NEXT: S_CBRANCH_SCC0 %bb.6, implicit $scc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.37.bb27: - ; GFX90A-NEXT: successors: %bb.39(0x40000000), %bb.38(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr42_sgpr43, $sgpr56_sgpr57, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45, $sgpr40_sgpr41 + ; GFX90A-NEXT: bb.36.bb27: + ; GFX90A-NEXT: successors: %bb.38(0x40000000), %bb.37(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr48_sgpr49:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr46_sgpr47 ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = COPY $exec ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr40_vgpr41, 2048, 0, implicit $exec :: (load (s8) from %ir.i28, addrspace 1) - ; GFX90A-NEXT: renamable $vgpr44 = V_ADD_CO_U32_e32 2048, $vgpr40, implicit-def $vcc, implicit $exec + ; GFX90A-NEXT: renamable $vgpr46 = V_ADD_CO_U32_e32 2048, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = COPY renamable $sgpr28_sgpr29 ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $vgpr45, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr47, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_MOV_B64 0 + ; GFX90A-NEXT: dead renamable $sgpr38_sgpr39 = S_AND_B64 renamable $vcc, -1, implicit-def $scc + ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF @@ -437,40 +461,43 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: $sgpr36_sgpr37 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.39, implicit $exec + ; GFX90A-NEXT: $exec = S_CMOV_B64_term killed renamable $vcc, implicit $scc + ; GFX90A-NEXT: S_CBRANCH_SCC1 %bb.38, implicit $scc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.38.Flow22: - ; GFX90A-NEXT: successors: %bb.36(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: bb.37.Flow22: + ; GFX90A-NEXT: successors: %bb.6(0x80000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr46_sgpr47, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr36_sgpr37, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_AND_B64 killed renamable $sgpr16_sgpr17, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_AND_B64 killed renamable $sgpr16_sgpr17, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_AND_B64 killed renamable $sgpr38_sgpr39, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_AND_B64 killed renamable $sgpr40_sgpr41, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr28_sgpr29 = S_ANDN2_B64 killed renamable $sgpr28_sgpr29, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr28_sgpr29 = S_OR_B64 killed renamable $sgpr28_sgpr29, killed renamable $sgpr54_sgpr55, implicit-def dead $scc - ; GFX90A-NEXT: S_BRANCH %bb.36 + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr28_sgpr29 = S_OR_B64 killed renamable $sgpr28_sgpr29, killed renamable $sgpr48_sgpr49, implicit-def dead $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr30_sgpr31, implicit-def $scc + ; GFX90A-NEXT: S_BRANCH %bb.6 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.39.bb34: - ; GFX90A-NEXT: successors: %bb.41(0x40000000), %bb.40(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41, $sgpr56_sgpr57, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45 + ; GFX90A-NEXT: bb.38.bb34: + ; GFX90A-NEXT: successors: %bb.40(0x40000000), %bb.39(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr48_sgpr49:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr46_sgpr47, $sgpr62_sgpr63 ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = COPY $exec ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr40_vgpr41, 3072, 0, implicit $exec :: (load (s8) from %ir.i35, addrspace 1) ; GFX90A-NEXT: renamable $vgpr56 = V_ADD_CO_U32_e32 3072, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = COPY renamable $sgpr28_sgpr29 + ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = COPY renamable $sgpr28_sgpr29 + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr57, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec + ; GFX90A-NEXT: dead renamable $sgpr16_sgpr17 = S_AND_B64 renamable $vcc, -1, implicit-def $scc ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF @@ -487,42 +514,45 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: $sgpr38_sgpr39 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.41, implicit $exec + ; GFX90A-NEXT: $exec = S_CMOV_B64_term killed renamable $vcc, implicit $scc + ; GFX90A-NEXT: S_CBRANCH_SCC1 %bb.40, implicit $scc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.40.Flow23: - ; GFX90A-NEXT: successors: %bb.38(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: bb.39.Flow23: + ; GFX90A-NEXT: successors: %bb.37(0x80000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr38_sgpr39, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_AND_B64 killed renamable $sgpr40_sgpr41, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_ANDN2_B64 renamable $sgpr28_sgpr29, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr38_sgpr39, killed renamable $sgpr40_sgpr41, implicit-def dead $scc - ; GFX90A-NEXT: S_BRANCH %bb.38 + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_AND_B64 killed renamable $sgpr40_sgpr41, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_ANDN2_B64 renamable $sgpr28_sgpr29, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr42_sgpr43, killed renamable $sgpr44_sgpr45, implicit-def dead $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr36_sgpr37, implicit-def $scc + ; GFX90A-NEXT: S_BRANCH %bb.37 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.41.bb41: - ; GFX90A-NEXT: successors: %bb.47(0x40000000), %bb.42(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr56_sgpr57, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47 + ; GFX90A-NEXT: bb.40.bb41: + ; GFX90A-NEXT: successors: %bb.46(0x40000000), %bb.41(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr48_sgpr49:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr46_sgpr47, $sgpr62_sgpr63 ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = COPY $exec ; GFX90A-NEXT: renamable $vgpr58 = V_ADD_CO_U32_e32 4096, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = COPY $vcc ; GFX90A-NEXT: renamable $vgpr59, dead renamable $sgpr16_sgpr17 = V_ADDC_U32_e64 0, $vgpr41, killed $sgpr16_sgpr17, 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr58_vgpr59, 0, 0, implicit $exec :: (load (s8) from %ir.i42, addrspace 1) ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = COPY renamable $sgpr28_sgpr29 + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = COPY renamable $sgpr28_sgpr29 ; GFX90A-NEXT: renamable $vgpr18, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec + ; GFX90A-NEXT: dead renamable $sgpr42_sgpr43 = S_AND_B64 renamable $vcc, -1, implicit-def $scc + ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF @@ -538,47 +568,47 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: $sgpr40_sgpr41 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.47, implicit $exec + ; GFX90A-NEXT: $exec = S_CMOV_B64_term killed renamable $vcc, implicit $scc + ; GFX90A-NEXT: S_CBRANCH_SCC1 %bb.46, implicit $scc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.42.Flow24: - ; GFX90A-NEXT: successors: %bb.40(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: bb.41.Flow24: + ; GFX90A-NEXT: successors: %bb.39(0x80000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr44_sgpr45, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr40_sgpr41, implicit-def $scc ; GFX90A-NEXT: renamable $vgpr59 = COPY killed renamable $vgpr18, implicit $exec ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_AND_B64 killed renamable $sgpr16_sgpr17, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = S_ANDN2_B64 renamable $sgpr28_sgpr29, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr16_sgpr17, killed renamable $sgpr54_sgpr55, implicit-def dead $scc - ; GFX90A-NEXT: S_BRANCH %bb.40 + ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_OR_B64 killed renamable $sgpr16_sgpr17, killed renamable $sgpr44_sgpr45, implicit-def dead $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr38_sgpr39, implicit-def $scc + ; GFX90A-NEXT: S_BRANCH %bb.39 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.43.bb55: - ; GFX90A-NEXT: successors: %bb.49(0x40000000), %bb.44(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr52_sgpr53, $sgpr44_sgpr45 + ; GFX90A-NEXT: bb.42.bb55: + ; GFX90A-NEXT: successors: %bb.48(0x40000000), %bb.43(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr54_sgpr55 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: S_BITCMP1_B32 killed renamable $sgpr33, 16, implicit-def $scc - ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_CSELECT_B64 -1, 0, implicit killed $scc - ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_XOR_B64 renamable $sgpr62_sgpr63, -1, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_CSELECT_B64 -1, 0, implicit killed $scc + ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_XOR_B64 renamable $sgpr60_sgpr61, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $vgpr62 = V_ADD_CO_U32_e32 6144, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $vgpr63, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr46_sgpr47, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.49, implicit $vcc + ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.48, implicit $vcc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.44: - ; GFX90A-NEXT: successors: %bb.45(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr52_sgpr53 + ; GFX90A-NEXT: bb.43: + ; GFX90A-NEXT: successors: %bb.44(0x80000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr54_sgpr55 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = COPY renamable $sgpr28_sgpr29 + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = COPY renamable $sgpr28_sgpr29 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF @@ -594,43 +624,45 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.45.Flow26: - ; GFX90A-NEXT: successors: %bb.46(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6, $sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18, $sgpr18_sgpr19, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr16, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr56, $vgpr57, $vgpr58, $vgpr60, $vgpr61, $vgpr62, $vgpr63 + ; GFX90A-NEXT: bb.44.Flow26: + ; GFX90A-NEXT: successors: %bb.45(0x80000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6, $sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18, $sgpr18_sgpr19, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr16, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr56, $vgpr57, $vgpr58, $vgpr60, $vgpr61, $vgpr62, $vgpr63 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.46.Flow26: - ; GFX90A-NEXT: successors: %bb.48(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: bb.45.Flow26: + ; GFX90A-NEXT: successors: %bb.47(0x80000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr16_sgpr17, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_ANDN2_B64 renamable $sgpr28_sgpr29, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_OR_B64 killed renamable $sgpr44_sgpr45, killed renamable $sgpr46_sgpr47, implicit-def dead $scc - ; GFX90A-NEXT: S_BRANCH %bb.48 + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_OR_B64 killed renamable $sgpr44_sgpr45, killed renamable $sgpr48_sgpr49, implicit-def dead $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr42_sgpr43, implicit-def $scc + ; GFX90A-NEXT: S_BRANCH %bb.47 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.47.bb48: - ; GFX90A-NEXT: successors: %bb.43(0x40000000), %bb.48(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr56_sgpr57, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr50_sgpr51, $sgpr44_sgpr45 + ; GFX90A-NEXT: bb.46.bb48: + ; GFX90A-NEXT: successors: %bb.42(0x40000000), %bb.47(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr48_sgpr49:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr62_sgpr63, $sgpr52_sgpr53 ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = COPY $exec ; GFX90A-NEXT: renamable $vgpr60 = V_ADD_CO_U32_e32 5120, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = COPY $vcc ; GFX90A-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 4096, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $vgpr1, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_UBYTE killed renamable $vgpr0_vgpr1, 1024, 0, implicit $exec :: (load (s8) from %ir.i49, addrspace 1) - ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = COPY renamable $sgpr28_sgpr29 - ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = COPY renamable $sgpr28_sgpr29 ; GFX90A-NEXT: renamable $vgpr61, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $sgpr16_sgpr17, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec + ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = S_MOV_B64 0 + ; GFX90A-NEXT: dead renamable $sgpr44_sgpr45 = S_AND_B64 renamable $vcc, -1, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF @@ -646,39 +678,40 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: $sgpr16_sgpr17 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.43, implicit $exec + ; GFX90A-NEXT: $exec = S_CMOV_B64_term killed renamable $vcc, implicit $scc + ; GFX90A-NEXT: S_CBRANCH_SCC1 %bb.42, implicit $scc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.48.Flow25: - ; GFX90A-NEXT: successors: %bb.42(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: bb.47.Flow25: + ; GFX90A-NEXT: successors: %bb.41(0x80000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr66_sgpr67, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr16_sgpr17, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr16_sgpr17, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_ANDN2_B64 renamable $sgpr28_sgpr29, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr42_sgpr43, killed renamable $sgpr54_sgpr55, implicit-def dead $scc - ; GFX90A-NEXT: S_BRANCH %bb.42 + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_OR_B64 killed renamable $sgpr42_sgpr43, killed renamable $sgpr46_sgpr47, implicit-def dead $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr40_sgpr41, implicit-def $scc + ; GFX90A-NEXT: S_BRANCH %bb.41 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.49.bb63: - ; GFX90A-NEXT: successors: %bb.51(0x40000000), %bb.50(0x40000000) - ; GFX90A-NEXT: liveins: $vcc, $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr46_sgpr47, $sgpr54_sgpr55:0x000000000000000F, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr52_sgpr53 + ; GFX90A-NEXT: bb.48.bb63: + ; GFX90A-NEXT: successors: %bb.50(0x40000000), %bb.49(0x40000000) + ; GFX90A-NEXT: liveins: $vcc, $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49:0x000000000000000F, $sgpr60_sgpr61, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr54_sgpr55 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 0 - ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.51, implicit $vcc + ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = S_MOV_B64 0 + ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.50, implicit $vcc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.50: - ; GFX90A-NEXT: successors: %bb.45(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr52_sgpr53 + ; GFX90A-NEXT: bb.49: + ; GFX90A-NEXT: successors: %bb.44(0x80000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr54_sgpr55 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = COPY renamable $sgpr28_sgpr29 + ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = COPY renamable $sgpr28_sgpr29 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF @@ -692,24 +725,24 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: S_BRANCH %bb.45 + ; GFX90A-NEXT: S_BRANCH %bb.44 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.51.bb68: - ; GFX90A-NEXT: successors: %bb.55(0x40000000), %bb.52(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr54_sgpr55:0x000000000000000F, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr52_sgpr53 + ; GFX90A-NEXT: bb.50.bb68: + ; GFX90A-NEXT: successors: %bb.52(0x40000000), %bb.51(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49:0x000000000000000F, $sgpr60_sgpr61, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr54_sgpr55 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = nuw nsw V_LSHLREV_B32_e32 3, $vgpr30, implicit $exec ; GFX90A-NEXT: renamable $vgpr1 = V_MOV_B32_e32 0, implicit $exec ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr46_sgpr47, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.55, implicit $vcc + ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.52, implicit $vcc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.52: - ; GFX90A-NEXT: successors: %bb.46(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr52_sgpr53 + ; GFX90A-NEXT: bb.51: + ; GFX90A-NEXT: successors: %bb.45(0x80000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr54_sgpr55 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = COPY renamable $sgpr28_sgpr29 + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = COPY renamable $sgpr28_sgpr29 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF @@ -722,26 +755,25 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: S_BRANCH %bb.46 - ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.53.bb80: - ; GFX90A-NEXT: successors: %bb.60(0x40000000), %bb.54(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr54_sgpr55:0x000000000000000F, $sgpr58_sgpr59, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr15 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def dead $scc - ; GFX90A-NEXT: S_CMP_EQ_U32 killed renamable $sgpr15, 0, implicit-def $scc - ; GFX90A-NEXT: renamable $vgpr6 = V_ADD_CO_U32_e32 4096, $vgpr0, implicit-def $vcc, implicit $exec - ; GFX90A-NEXT: renamable $vgpr7, dead renamable $sgpr48_sgpr49 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec - ; GFX90A-NEXT: S_CBRANCH_SCC1 %bb.60, implicit killed $scc + ; GFX90A-NEXT: S_BRANCH %bb.45 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.54: - ; GFX90A-NEXT: successors: %bb.62(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr58_sgpr59, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: bb.52.bb73: + ; GFX90A-NEXT: successors: %bb.53(0x40000000), %bb.45(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49:0x000000000000000F, $sgpr60_sgpr61, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = COPY renamable $sgpr28_sgpr29 + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = COPY $exec + ; GFX90A-NEXT: renamable $vgpr6 = GLOBAL_LOAD_UBYTE renamable $vgpr0_vgpr1, 2048, 0, implicit $exec :: (load (s8) from %ir.i74, addrspace 1) + ; GFX90A-NEXT: renamable $vgpr4 = V_ADD_CO_U32_e32 2048, $vgpr0, implicit-def $vcc, implicit $exec + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = COPY renamable $sgpr28_sgpr29 + ; GFX90A-NEXT: renamable $vgpr5, dead renamable $sgpr52_sgpr53 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec + ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr6, implicit $exec + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0 + ; GFX90A-NEXT: dead renamable $sgpr56_sgpr57 = S_AND_B64 renamable $vcc, -1, implicit-def $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF @@ -751,22 +783,27 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: S_BRANCH %bb.62 + ; GFX90A-NEXT: $exec = S_CMOV_B64_term killed renamable $vcc, implicit $scc + ; GFX90A-NEXT: S_CBRANCH_SCC0 %bb.45, implicit $scc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.55.bb73: - ; GFX90A-NEXT: successors: %bb.53(0x40000000), %bb.56(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr54_sgpr55:0x000000000000000F, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50_sgpr51 + ; GFX90A-NEXT: bb.53.bb80: + ; GFX90A-NEXT: successors: %bb.58(0x40000000), %bb.54(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49:0x000000000000000F, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr6 = GLOBAL_LOAD_UBYTE renamable $vgpr0_vgpr1, 2048, 0, implicit $exec :: (load (s8) from %ir.i74, addrspace 1) - ; GFX90A-NEXT: renamable $vgpr4 = V_ADD_CO_U32_e32 2048, $vgpr0, implicit-def $vcc, implicit $exec - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = COPY renamable $sgpr28_sgpr29 - ; GFX90A-NEXT: renamable $vgpr5, dead renamable $sgpr56_sgpr57 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec - ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr6, implicit $exec - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr15 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def dead $scc + ; GFX90A-NEXT: S_CMP_EQ_U32 killed renamable $sgpr15, 0, implicit-def $scc + ; GFX90A-NEXT: renamable $vgpr6 = V_ADD_CO_U32_e32 4096, $vgpr0, implicit-def $vcc, implicit $exec + ; GFX90A-NEXT: renamable $vgpr7, dead renamable $sgpr50_sgpr51 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec + ; GFX90A-NEXT: S_CBRANCH_SCC1 %bb.58, implicit killed $scc + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: bb.54: + ; GFX90A-NEXT: successors: %bb.60(0x80000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr58_sgpr59, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = COPY renamable $sgpr28_sgpr29 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF @@ -776,51 +813,44 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: $sgpr58_sgpr59 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.53, implicit $exec + ; GFX90A-NEXT: S_BRANCH %bb.60 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.56.Flow29: - ; GFX90A-NEXT: successors: %bb.46(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: bb.55.bb90: + ; GFX90A-NEXT: successors: %bb.59(0x80000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49:0x000000000000000F, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr58_sgpr59, implicit-def $scc - ; GFX90A-NEXT: S_BRANCH %bb.46 - ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.57.bb90: - ; GFX90A-NEXT: successors: %bb.61(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr50_sgpr51, $sgpr54_sgpr55:0x000000000000000F, $sgpr58_sgpr59, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr53 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr62_sgpr63, implicit $exec + ; GFX90A-NEXT: renamable $vgpr53 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr60_sgpr61, implicit $exec ; GFX90A-NEXT: renamable $vgpr10 = V_MOV_B32_e32 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr14_vgpr15 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3) ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr21, implicit $exec ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3) ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr22, implicit $exec ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3) - ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr54, implicit $exec - ; GFX90A-NEXT: renamable $vgpr11 = V_ALIGNBIT_B32_e64 killed $sgpr55, killed $vgpr10, 1, implicit $exec + ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr48, implicit $exec + ; GFX90A-NEXT: renamable $vgpr11 = V_ALIGNBIT_B32_e64 killed $sgpr49, killed $vgpr10, 1, implicit $exec ; GFX90A-NEXT: renamable $vgpr52 = V_ALIGNBIT_B32_e64 $vgpr17, $vgpr16, 1, implicit $exec ; GFX90A-NEXT: renamable $vgpr17 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr8_sgpr9, implicit $exec ; GFX90A-NEXT: renamable $vgpr15 = V_ALIGNBIT_B32_e64 $vgpr15, $vgpr14, 1, implicit $exec - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_OR_B64 renamable $sgpr28_sgpr29, $exec, implicit-def dead $scc - ; GFX90A-NEXT: S_BRANCH %bb.61 + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_XOR_B64 $exec, -1, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_OR_B64 renamable $sgpr28_sgpr29, $exec, implicit-def dead $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr52_sgpr53, implicit-def $scc + ; GFX90A-NEXT: S_BRANCH %bb.59 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.58: + ; GFX90A-NEXT: bb.56: ; GFX90A-NEXT: successors: %bb.7(0x80000000) - ; GFX90A-NEXT: liveins: $exec, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr23, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr42_sgpr43, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr2_vgpr3:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $exec, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr23, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr40_sgpr41, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr2_vgpr3:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr15 = COPY killed renamable $sgpr23, implicit $exec ; GFX90A-NEXT: renamable $vgpr17 = COPY killed renamable $sgpr15, implicit $exec ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF @@ -830,10 +860,10 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr60_vgpr61 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr58_vgpr59 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr56_vgpr57 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr44_vgpr45 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr46_vgpr47 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr42_vgpr43 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr40_vgpr41 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr46_vgpr47 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr44_vgpr45 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr14 = COPY renamable $vgpr15, implicit $exec ; GFX90A-NEXT: renamable $vgpr52 = COPY renamable $vgpr15, implicit $exec ; GFX90A-NEXT: renamable $vgpr16 = COPY renamable $vgpr15, implicit $exec @@ -843,9 +873,9 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 0 ; GFX90A-NEXT: S_BRANCH %bb.7 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.59.bb105: + ; GFX90A-NEXT: bb.57.bb105: ; GFX90A-NEXT: successors: %bb.3(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr30_sgpr31, $sgpr42_sgpr43, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000FF, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr30_sgpr31, $sgpr40_sgpr41, $sgpr48_sgpr49:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000FF, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr22_vgpr23 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3) @@ -862,17 +892,19 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr15 = S_MOV_B32 0 ; GFX90A-NEXT: S_BRANCH %bb.3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.60.bb85: - ; GFX90A-NEXT: successors: %bb.57(0x40000000), %bb.61(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr54_sgpr55:0x000000000000000F, $sgpr58_sgpr59, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: bb.58.bb85: + ; GFX90A-NEXT: successors: %bb.55(0x40000000), %bb.59(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49:0x000000000000000F, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = COPY $exec ; GFX90A-NEXT: renamable $vgpr8 = V_OR_B32_e32 1, $vgpr6, implicit $exec ; GFX90A-NEXT: renamable $vgpr9 = COPY renamable $vgpr7, implicit $exec ; GFX90A-NEXT: renamable $vgpr10 = FLAT_LOAD_UBYTE renamable $vgpr8_vgpr9, 0, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i86) ; GFX90A-NEXT: renamable $sgpr15 = S_MOV_B32 0 - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr10, implicit $exec - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = COPY renamable $sgpr28_sgpr29 + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 -1 + ; GFX90A-NEXT: dead renamable $sgpr54_sgpr55 = S_AND_B64 renamable $vcc, -1, implicit-def $scc + ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = COPY renamable $sgpr28_sgpr29 ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF @@ -881,70 +913,80 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF - ; GFX90A-NEXT: $sgpr50_sgpr51 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.57, implicit $exec + ; GFX90A-NEXT: $exec = S_CMOV_B64_term killed renamable $vcc, implicit $scc + ; GFX90A-NEXT: S_CBRANCH_SCC1 %bb.55, implicit $scc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.61.Flow31: - ; GFX90A-NEXT: successors: %bb.62(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: bb.59.Flow31: + ; GFX90A-NEXT: successors: %bb.60(0x80000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr50_sgpr51, $sgpr58_sgpr59, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr50_sgpr51, implicit-def $scc - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $vgpr14, implicit $exec ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.62.Flow30: - ; GFX90A-NEXT: successors: %bb.56(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: bb.60.Flow30: + ; GFX90A-NEXT: successors: %bb.45(0x80000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr58_sgpr59, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_XOR_B64 $exec, -1, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_ANDN2_B64 renamable $sgpr28_sgpr29, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_OR_B64 killed renamable $sgpr48_sgpr49, killed renamable $sgpr54_sgpr55, implicit-def dead $scc - ; GFX90A-NEXT: S_BRANCH %bb.56 + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_OR_B64 killed renamable $sgpr48_sgpr49, killed renamable $sgpr50_sgpr51, implicit-def dead $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr58_sgpr59, implicit-def $scc + ; GFX90A-NEXT: S_BRANCH %bb.45 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.63.bb140: - ; GFX90A-NEXT: successors: %bb.69(0x40000000), %bb.64(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: bb.61.bb140: + ; GFX90A-NEXT: successors: %bb.68(0x40000000), %bb.62(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr28_sgpr29 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr26_sgpr27, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.69, implicit $vcc + ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.68, implicit $vcc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.64.Flow13: - ; GFX90A-NEXT: successors: %bb.65(0x40000000), %bb.67(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: bb.62.Flow13: + ; GFX90A-NEXT: successors: %bb.63(0x40000000), %bb.66(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr28_sgpr29, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.67, implicit $vcc + ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.66, implicit $vcc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.65.bb159: - ; GFX90A-NEXT: successors: %bb.68(0x40000000), %bb.66(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: bb.63.bb159: + ; GFX90A-NEXT: successors: %bb.67(0x40000000), %bb.64(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vcc = V_CMP_NE_U32_e64 0, killed $vgpr30, implicit $exec - ; GFX90A-NEXT: $sgpr8_sgpr9 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: renamable $sgpr8_sgpr9 = S_XOR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.68, implicit $exec + ; GFX90A-NEXT: renamable $sgpr8_sgpr9 = S_XOR_B64 renamable $vcc, $exec, implicit-def $scc + ; GFX90A-NEXT: dead renamable $sgpr18_sgpr19 = S_AND_B64 renamable $vcc, -1, implicit-def $scc + ; GFX90A-NEXT: $exec = S_CMOV_B64_term killed renamable $vcc, implicit $scc + ; GFX90A-NEXT: S_CBRANCH_SCC1 %bb.67, implicit $scc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.66.Flow10: - ; GFX90A-NEXT: successors: %bb.67(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: bb.64.Flow10: + ; GFX90A-NEXT: successors: %bb.65(0x40000000), %bb.66(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $sgpr8_sgpr9 = S_ANDN2_SAVEEXEC_B64 $sgpr8_sgpr9, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def $scc + ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_XOR_B64 renamable $sgpr8_sgpr9, $exec, implicit-def $scc + ; GFX90A-NEXT: dead renamable $sgpr20_sgpr21 = S_AND_B64 renamable $sgpr8_sgpr9, -1, implicit-def $scc + ; GFX90A-NEXT: $exec = S_CMOV_B64_term killed renamable $sgpr8_sgpr9, implicit $scc + ; GFX90A-NEXT: S_CBRANCH_SCC0 %bb.66, implicit $scc + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: bb.65.bb160: + ; GFX90A-NEXT: successors: %bb.66(0x80000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr18_sgpr19, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr18_sgpr19, implicit-def $scc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.67.Flow14: + ; GFX90A-NEXT: bb.66.Flow14: ; GFX90A-NEXT: successors: %bb.8(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = COPY $exec + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = COPY $exec + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr30_sgpr31, implicit-def $scc ; GFX90A-NEXT: S_BRANCH %bb.8 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.68.bb161: - ; GFX90A-NEXT: successors: %bb.66(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: bb.67.bb161: + ; GFX90A-NEXT: successors: %bb.64(0x80000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr21, killed $vgpr23, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr25, implicit $exec @@ -959,11 +1001,12 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr15, implicit $exec ; GFX90A-NEXT: DS_WRITE2_B32_gfx9 killed renamable $vgpr3, killed renamable $vgpr2, renamable $vgpr3, 0, 1, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, align 4, addrspace 3) - ; GFX90A-NEXT: S_BRANCH %bb.66 + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, renamable $sgpr8_sgpr9, implicit-def $scc + ; GFX90A-NEXT: S_BRANCH %bb.64 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.69.bb174: - ; GFX90A-NEXT: successors: %bb.73(0x40000000), %bb.70(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: bb.68.bb174: + ; GFX90A-NEXT: successors: %bb.72(0x40000000), %bb.69(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr26 = V_OR_B32_e32 1, $vgpr24, implicit $exec ; GFX90A-NEXT: renamable $vgpr48 = V_OR_B32_e32 $vgpr26, $vgpr22, implicit $exec @@ -975,18 +1018,18 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr50 = V_CNDMASK_B32_e64 0, 0, 0, $vgpr32, killed $sgpr8_sgpr9, implicit $exec ; GFX90A-NEXT: renamable $sgpr8_sgpr9 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr24_sgpr25, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.73, implicit $vcc + ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.72, implicit $vcc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.70.Flow: - ; GFX90A-NEXT: successors: %bb.71(0x40000000), %bb.72(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: bb.69.Flow: + ; GFX90A-NEXT: successors: %bb.70(0x40000000), %bb.71(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.72, implicit $vcc + ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.71, implicit $vcc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.71.bb186: - ; GFX90A-NEXT: successors: %bb.72(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: bb.70.bb186: + ; GFX90A-NEXT: successors: %bb.71(0x80000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr2_vgpr3 = V_LSHLREV_B64_e64 3, killed $vgpr2_vgpr3, implicit $exec ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr19, implicit $exec @@ -1013,23 +1056,23 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.72.Flow9: - ; GFX90A-NEXT: successors: %bb.64(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: bb.71.Flow9: + ; GFX90A-NEXT: successors: %bb.62(0x80000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr28_sgpr29 = S_MOV_B64 0 - ; GFX90A-NEXT: S_BRANCH %bb.64 + ; GFX90A-NEXT: S_BRANCH %bb.62 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.73.bb196: - ; GFX90A-NEXT: successors: %bb.70(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: bb.72.bb196: + ; GFX90A-NEXT: successors: %bb.69(0x80000000) + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr10 = V_OR_B32_e32 $vgpr50, killed $vgpr16, implicit $exec ; GFX90A-NEXT: renamable $vgpr54 = V_OR_B32_e32 killed $vgpr10, killed $vgpr14, implicit $exec ; GFX90A-NEXT: renamable $vgpr55 = V_MOV_B32_e32 0, implicit $exec ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr55, renamable $vgpr54_vgpr55, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) ; GFX90A-NEXT: renamable $sgpr8_sgpr9 = S_MOV_B64 0 - ; GFX90A-NEXT: S_BRANCH %bb.70 + ; GFX90A-NEXT: S_BRANCH %bb.69 bb: %i = tail call i32 @llvm.amdgcn.workitem.id.x() %i11 = icmp eq i32 %i, 0 diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll index 903bc85ed6616..87ef96fd46be0 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs -amdgpu-s-branch-bits=7 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1030 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -amdgpu-s-branch-bits=7 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1010 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-s-branch-bits=7 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1030 %s diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll index 2f637df4e9302..d0bdf0d0d5690 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll @@ -167,17 +167,19 @@ define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 { ; GCN-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: v_add_i32_e64 v0, s[0:1], s0, v0 +; GCN-NEXT: s_mov_b64 s[4:5], exec +; GCN-NEXT: v_addc_u32_e64 v1, s[0:1], 0, v1, s[0:1] ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GCN-NEXT: s_cbranch_execnz .LBB3_1 +; GCN-NEXT: s_and_b64 s[6:7], vcc, -1 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc1 .LBB3_1 ; GCN-NEXT: ; %bb.3: ; %bb -; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_getpc_b64 s[0:1] ; GCN-NEXT: .Lpost_getpc2: -; GCN-NEXT: s_add_u32 s4, s4, (.LBB3_2-.Lpost_getpc2)&4294967295 -; GCN-NEXT: s_addc_u32 s5, s5, (.LBB3_2-.Lpost_getpc2)>>32 -; GCN-NEXT: s_setpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s0, s0, (.LBB3_2-.Lpost_getpc2)&4294967295 +; GCN-NEXT: s_addc_u32 s1, s1, (.LBB3_2-.Lpost_getpc2)>>32 +; GCN-NEXT: s_setpc_b64 s[0:1] ; GCN-NEXT: .LBB3_1: ; %bb2 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; 32 bytes @@ -186,8 +188,8 @@ define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 { ; GCN-NEXT: v_nop_e64 ; GCN-NEXT: v_nop_e64 ; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: .LBB3_2: ; %bb3 -; GCN-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-NEXT: s_mov_b32 s0, s2 ; GCN-NEXT: s_mov_b32 s1, s2 ; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 @@ -444,13 +446,15 @@ define amdgpu_kernel void @uniform_inside_divergent(ptr addrspace(1) %out, i32 % ; GCN-LABEL: uniform_inside_divergent: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execnz .LBB8_1 -; GCN-NEXT: ; %bb.4: ; %entry +; GCN-NEXT: s_mov_b64 s[4:5], exec +; GCN-NEXT: s_and_b64 s[2:3], vcc, -1 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc1 .LBB8_1 +; GCN-NEXT: ; %bb.5: ; %entry ; GCN-NEXT: s_getpc_b64 s[0:1] ; GCN-NEXT: .Lpost_getpc9: -; GCN-NEXT: s_add_u32 s0, s0, (.LBB8_3-.Lpost_getpc9)&4294967295 -; GCN-NEXT: s_addc_u32 s1, s1, (.LBB8_3-.Lpost_getpc9)>>32 +; GCN-NEXT: s_add_u32 s0, s0, (.LBB8_4-.Lpost_getpc9)&4294967295 +; GCN-NEXT: s_addc_u32 s1, s1, (.LBB8_4-.Lpost_getpc9)>>32 ; GCN-NEXT: s_setpc_b64 s[0:1] ; GCN-NEXT: .LBB8_1: ; %if ; GCN-NEXT: s_load_dword s6, s[0:1], 0xb @@ -466,8 +470,9 @@ define amdgpu_kernel void @uniform_inside_divergent(ptr addrspace(1) %out, i32 % ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, 1 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GCN-NEXT: .LBB8_3: ; %endif +; GCN-NEXT: .LBB8_3: ; %Flow ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: .LBB8_4: ; %endif ; GCN-NEXT: s_sleep 5 ; GCN-NEXT: s_endpgm entry: @@ -500,23 +505,33 @@ define amdgpu_kernel void @analyze_mask_branch() #0 { ; GCN-NEXT: v_mov_b32_e64 v0, 0 ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 -; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GCN-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GCN-NEXT: s_cbranch_execz .LBB9_2 -; GCN-NEXT: ; %bb.1: ; %ret +; GCN-NEXT: s_xor_b64 s[0:1], vcc, exec +; GCN-NEXT: s_and_b64 s[2:3], vcc, -1 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc1 .LBB9_1 +; GCN-NEXT: ; %bb.6: ; %entry +; GCN-NEXT: s_getpc_b64 s[2:3] +; GCN-NEXT: .Lpost_getpc10: +; GCN-NEXT: s_add_u32 s2, s2, (.LBB9_2-.Lpost_getpc10)&4294967295 +; GCN-NEXT: s_addc_u32 s3, s3, (.LBB9_2-.Lpost_getpc10)>>32 +; GCN-NEXT: s_setpc_b64 s[2:3] +; GCN-NEXT: .LBB9_1: ; %ret ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, 7 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-NEXT: .LBB9_2: ; %Flow1 -; GCN-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GCN-NEXT: s_cbranch_execnz .LBB9_3 -; GCN-NEXT: ; %bb.6: ; %Flow1 +; GCN-NEXT: s_xor_b64 s[2:3], s[0:1], exec +; GCN-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GCN-NEXT: s_cmov_b64 exec, s[0:1] +; GCN-NEXT: s_cbranch_scc1 .LBB9_3 +; GCN-NEXT: ; %bb.8: ; %Flow1 ; GCN-NEXT: s_getpc_b64 s[0:1] -; GCN-NEXT: .Lpost_getpc10: -; GCN-NEXT: s_add_u32 s0, s0, (.LBB9_5-.Lpost_getpc10)&4294967295 -; GCN-NEXT: s_addc_u32 s1, s1, (.LBB9_5-.Lpost_getpc10)>>32 +; GCN-NEXT: .Lpost_getpc11: +; GCN-NEXT: s_add_u32 s0, s0, (.LBB9_5-.Lpost_getpc11)&4294967295 +; GCN-NEXT: s_addc_u32 s1, s1, (.LBB9_5-.Lpost_getpc11)>>32 ; GCN-NEXT: s_setpc_b64 s[0:1] ; GCN-NEXT: .LBB9_3: ; %loop.preheader ; GCN-NEXT: s_and_b64 vcc, exec, 0 @@ -534,12 +549,12 @@ define amdgpu_kernel void @analyze_mask_branch() #0 { ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_mov_b64 vcc, vcc ; GCN-NEXT: s_cbranch_vccnz .LBB9_5 -; GCN-NEXT: ; %bb.8: ; %loop +; GCN-NEXT: ; %bb.10: ; %loop ; GCN-NEXT: ; in Loop: Header=BB9_4 Depth=1 ; GCN-NEXT: s_getpc_b64 s[0:1] -; GCN-NEXT: .Lpost_getpc11: -; GCN-NEXT: s_add_u32 s0, s0, (.LBB9_4-.Lpost_getpc11)&4294967295 -; GCN-NEXT: s_addc_u32 s1, s1, (.LBB9_4-.Lpost_getpc11)>>32 +; GCN-NEXT: .Lpost_getpc12: +; GCN-NEXT: s_add_u32 s0, s0, (.LBB9_4-.Lpost_getpc12)&4294967295 +; GCN-NEXT: s_addc_u32 s1, s1, (.LBB9_4-.Lpost_getpc12)>>32 ; GCN-NEXT: s_setpc_b64 s[0:1] ; GCN-NEXT: .LBB9_5: ; %UnifiedReturnBlock ; GCN-NEXT: s_endpgm @@ -582,9 +597,9 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32 ; GCN-NEXT: s_cbranch_scc1 .LBB10_1 ; GCN-NEXT: ; %bb.8: ; %bb ; GCN-NEXT: s_getpc_b64 s[8:9] -; GCN-NEXT: .Lpost_getpc12: -; GCN-NEXT: s_add_u32 s8, s8, (.LBB10_2-.Lpost_getpc12)&4294967295 -; GCN-NEXT: s_addc_u32 s9, s9, (.LBB10_2-.Lpost_getpc12)>>32 +; GCN-NEXT: .Lpost_getpc13: +; GCN-NEXT: s_add_u32 s8, s8, (.LBB10_2-.Lpost_getpc13)&4294967295 +; GCN-NEXT: s_addc_u32 s9, s9, (.LBB10_2-.Lpost_getpc13)>>32 ; GCN-NEXT: s_setpc_b64 s[8:9] ; GCN-NEXT: .LBB10_1: ; %bb13 ; GCN-NEXT: ;;#ASMSTART @@ -608,9 +623,9 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32 ; GCN-NEXT: s_cbranch_vccz .LBB10_5 ; GCN-NEXT: ; %bb.10: ; %Flow5 ; GCN-NEXT: s_getpc_b64 s[2:3] -; GCN-NEXT: .Lpost_getpc13: -; GCN-NEXT: s_add_u32 s2, s2, (.LBB10_6-.Lpost_getpc13)&4294967295 -; GCN-NEXT: s_addc_u32 s3, s3, (.LBB10_6-.Lpost_getpc13)>>32 +; GCN-NEXT: .Lpost_getpc14: +; GCN-NEXT: s_add_u32 s2, s2, (.LBB10_6-.Lpost_getpc14)&4294967295 +; GCN-NEXT: s_addc_u32 s3, s3, (.LBB10_6-.Lpost_getpc14)>>32 ; GCN-NEXT: s_setpc_b64 s[2:3] ; GCN-NEXT: .LBB10_5: ; %bb14 ; GCN-NEXT: s_cmp_lt_i32 s5, 9 diff --git a/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll b/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll index 82808cd309227..feafdc07ed78c 100644 --- a/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll +++ b/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll @@ -1,5 +1,4 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefix=ISA ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -stop-before=si-fix-sgpr-copies < %s | FileCheck %s -check-prefix=MIR @@ -30,78 +29,18 @@ define void @f(i32 %arg, ptr %ptr) { ; ISA-NEXT: v_mov_b32_e32 v7, v6 ; ISA-NEXT: s_and_b32 s5, exec_lo, vcc_lo ; ISA-NEXT: s_or_b32 s4, s5, s4 +; ISA-NEXT: s_andn2_b32 s5, exec_lo, s4 ; ISA-NEXT: v_add_f32_e32 v6, v7, v0 +; ISA-NEXT: s_and_b32 s6, s5, -1 ; ISA-NEXT: v_add_f32_e64 v6, v6, |v3| ; ISA-NEXT: v_add_f32_e32 v6, v6, v4 ; ISA-NEXT: v_add_f32_e32 v6, v6, v5 -; ISA-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; ISA-NEXT: s_cbranch_execnz .LBB0_1 +; ISA-NEXT: s_cselect_b32 exec_lo, s5, s4 +; ISA-NEXT: s_cbranch_scc1 .LBB0_1 ; ISA-NEXT: ; %bb.2: ; %bb21 -; ISA-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; ISA-NEXT: flat_store_dword v[1:2], v7 ; ISA-NEXT: s_waitcnt lgkmcnt(0) ; ISA-NEXT: s_setpc_b64 s[30:31] - ; MIR-LABEL: name: f - ; MIR: bb.0.bb: - ; MIR-NEXT: successors: %bb.1(0x80000000) - ; MIR-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; MIR-NEXT: {{ $}} - ; MIR-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; MIR-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; MIR-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; MIR-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; MIR-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] - ; MIR-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; MIR-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[S_MOV_B64_]], 0, 0 :: (invariant load (s64) from `ptr addrspace(4) null`, align 4294967296, addrspace 4) - ; MIR-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 - ; MIR-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0 - ; MIR-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 - ; MIR-NEXT: [[S_LSHR_B32_:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[COPY4]], [[S_MOV_B32_]], implicit-def dead $scc - ; MIR-NEXT: [[S_LSHR_B32_1:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[S_MOV_B32_]], [[COPY5]], implicit-def dead $scc - ; MIR-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; MIR-NEXT: S_CMP_LG_U32 [[COPY5]], [[S_MOV_B32_1]], implicit-def $scc - ; MIR-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec = COPY $scc - ; MIR-NEXT: $scc = COPY [[COPY6]] - ; MIR-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_LSHR_B32_]], [[S_MOV_B32_1]], implicit $scc - ; MIR-NEXT: [[V_CVT_F32_I32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e64 killed [[S_CSELECT_B32_]], 0, 0, implicit $mode, implicit $exec - ; MIR-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY [[V_CVT_F32_I32_e64_]] - ; MIR-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 1065353216 - ; MIR-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 - ; MIR-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[S_MOV_B32_2]] - ; MIR-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[S_MOV_B32_3]], 0, [[COPY8]], [[COPY6]], implicit $exec - ; MIR-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY [[V_CNDMASK_B32_e64_]] - ; MIR-NEXT: $scc = COPY [[COPY6]] - ; MIR-NEXT: [[S_CSELECT_B32_1:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_LSHR_B32_1]], [[S_MOV_B32_1]], implicit $scc - ; MIR-NEXT: [[V_CVT_F32_UBYTE0_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_UBYTE0_e64 killed [[S_CSELECT_B32_1]], 0, 0, implicit $exec - ; MIR-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY [[V_CVT_F32_UBYTE0_e64_]] - ; MIR-NEXT: $scc = COPY [[COPY6]] - ; MIR-NEXT: [[S_CSELECT_B32_2:%[0-9]+]]:sreg_32 = S_CSELECT_B32 [[COPY4]], [[S_MOV_B32_1]], implicit $scc - ; MIR-NEXT: [[V_CVT_F32_I32_e64_1:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e64 killed [[S_CSELECT_B32_2]], 0, 0, implicit $mode, implicit $exec - ; MIR-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY [[V_CVT_F32_I32_e64_1]] - ; MIR-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_I32_e64 [[COPY2]], [[S_MOV_B32_]], implicit $exec - ; MIR-NEXT: [[COPY12:%[0-9]+]]:vreg_1 = COPY [[V_CMP_LT_I32_e64_]] - ; MIR-NEXT: {{ $}} - ; MIR-NEXT: bb.1.bb14: - ; MIR-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) - ; MIR-NEXT: {{ $}} - ; MIR-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_1]], %bb.0, %7, %bb.1 - ; MIR-NEXT: [[PHI1:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_3]], %bb.0, %8, %bb.1 - ; MIR-NEXT: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY12]] - ; MIR-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[COPY13]], [[PHI]], implicit-def dead $scc - ; MIR-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[PHI1]], 0, [[COPY9]], 0, 0, implicit $mode, implicit $exec - ; MIR-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_]], 2, [[COPY7]], 0, 0, implicit $mode, implicit $exec - ; MIR-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_1]], 0, [[COPY10]], 0, 0, implicit $mode, implicit $exec - ; MIR-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_2]], 0, [[COPY11]], 0, 0, implicit $mode, implicit $exec - ; MIR-NEXT: [[COPY14:%[0-9]+]]:sgpr_32 = COPY [[V_ADD_F32_e64_3]] - ; MIR-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; MIR-NEXT: S_BRANCH %bb.2 - ; MIR-NEXT: {{ $}} - ; MIR-NEXT: bb.2.bb21: - ; MIR-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1 - ; MIR-NEXT: [[PHI3:%[0-9]+]]:sreg_32 = PHI [[SI_IF_BREAK]], %bb.1 - ; MIR-NEXT: SI_END_CF [[PHI3]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; MIR-NEXT: FLAT_STORE_DWORD [[COPY3]], [[PHI2]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.ptr) - ; MIR-NEXT: SI_RETURN bb: %i = load <2 x i32>, ptr addrspace(4) null, align 4294967296 %i1 = extractelement <2 x i32> %i, i64 1 @@ -134,3 +73,5 @@ bb21: } declare float @llvm.fabs.f32(float) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; MIR: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/bypass-div.ll b/llvm/test/CodeGen/AMDGPU/bypass-div.ll index 4d8687b141a79..d17c3dba5d9c9 100644 --- a/llvm/test/CodeGen/AMDGPU/bypass-div.ll +++ b/llvm/test/CodeGen/AMDGPU/bypass-div.ll @@ -12,9 +12,10 @@ define i64 @sdiv64(i64 %a, i64 %b) { ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB0_2 +; GFX9-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_ashrrev_i32_e32 v9, 31, v3 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v9 @@ -122,9 +123,12 @@ define i64 @sdiv64(i64 %a, i64 %b) { ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v0, v2, vcc ; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-NEXT: .LBB0_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] -; GFX9-NEXT: s_cbranch_execz .LBB0_4 +; GFX9-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX9-NEXT: s_cmov_b64 exec, s[6:7] +; GFX9-NEXT: s_cbranch_scc0 .LBB0_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v2 ; GFX9-NEXT: v_sub_u32_e32 v3, 0, v2 @@ -146,8 +150,8 @@ define i64 @sdiv64(i64 %a, i64 %b) { ; GFX9-NEXT: v_add_u32_e32 v3, 1, v1 ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v3, vcc -; GFX9-NEXT: .LBB0_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB0_4: ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -163,9 +167,10 @@ define i64 @udiv64(i64 %a, i64 %b) { ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v3 @@ -258,9 +263,12 @@ define i64 @udiv64(i64 %a, i64 %b) { ; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v0, vcc ; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-NEXT: .LBB1_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] -; GFX9-NEXT: s_cbranch_execz .LBB1_4 +; GFX9-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX9-NEXT: s_cmov_b64 exec, s[6:7] +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v2 ; GFX9-NEXT: v_sub_u32_e32 v3, 0, v2 @@ -282,8 +290,8 @@ define i64 @udiv64(i64 %a, i64 %b) { ; GFX9-NEXT: v_add_u32_e32 v3, 1, v1 ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v3, vcc -; GFX9-NEXT: .LBB1_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB1_4: ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -299,9 +307,10 @@ define i64 @srem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB2_2 +; GFX9-NEXT: s_xor_b64 s[8:9], vcc, exec +; GFX9-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 @@ -407,9 +416,12 @@ define i64 @srem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v5, vcc ; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: .LBB2_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[8:9] -; GFX9-NEXT: s_cbranch_execz .LBB2_4 +; GFX9-NEXT: s_xor_b64 s[4:5], s[8:9], exec +; GFX9-NEXT: s_and_b64 s[6:7], s[8:9], -1 +; GFX9-NEXT: s_cmov_b64 exec, s[8:9] +; GFX9-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v2 ; GFX9-NEXT: v_sub_u32_e32 v3, 0, v2 @@ -429,8 +441,8 @@ define i64 @srem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_sub_u32_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc -; GFX9-NEXT: .LBB2_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB2_4: ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -446,9 +458,10 @@ define i64 @urem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB3_2 +; GFX9-NEXT: s_xor_b64 s[8:9], vcc, exec +; GFX9-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v3 @@ -540,9 +553,12 @@ define i64 @urem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc ; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: .LBB3_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[8:9] -; GFX9-NEXT: s_cbranch_execz .LBB3_4 +; GFX9-NEXT: s_xor_b64 s[4:5], s[8:9], exec +; GFX9-NEXT: s_and_b64 s[6:7], s[8:9], -1 +; GFX9-NEXT: s_cmov_b64 exec, s[8:9] +; GFX9-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v2 ; GFX9-NEXT: v_sub_u32_e32 v3, 0, v2 @@ -562,8 +578,8 @@ define i64 @urem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_sub_u32_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc -; GFX9-NEXT: .LBB3_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB3_4: ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -705,9 +721,10 @@ define <2 x i64> @sdivrem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX9-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[10:11], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB8_2 +; GFX9-NEXT: s_xor_b64 s[10:11], vcc, exec +; GFX9-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_ashrrev_i32_e32 v9, 31, v3 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v9 @@ -826,9 +843,12 @@ define <2 x i64> @sdivrem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v1, v7, vcc ; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX9-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX9-NEXT: .LBB8_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] -; GFX9-NEXT: s_cbranch_execz .LBB8_4 +; GFX9-NEXT: s_xor_b64 s[4:5], s[10:11], exec +; GFX9-NEXT: s_and_b64 s[6:7], s[10:11], -1 +; GFX9-NEXT: s_cmov_b64 exec, s[10:11] +; GFX9-NEXT: s_cbranch_scc0 .LBB8_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v2 ; GFX9-NEXT: v_sub_u32_e32 v3, 0, v2 @@ -853,8 +873,8 @@ define <2 x i64> @sdivrem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc -; GFX9-NEXT: .LBB8_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB8_4: ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: v_mov_b32_e32 v2, v6 @@ -876,9 +896,10 @@ define <2 x i64> @udivrem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX9-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB9_2 +; GFX9-NEXT: s_xor_b64 s[8:9], vcc, exec +; GFX9-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v3 @@ -978,9 +999,12 @@ define <2 x i64> @udivrem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc ; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: .LBB9_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[8:9] -; GFX9-NEXT: s_cbranch_execz .LBB9_4 +; GFX9-NEXT: s_xor_b64 s[4:5], s[8:9], exec +; GFX9-NEXT: s_and_b64 s[6:7], s[8:9], -1 +; GFX9-NEXT: s_cmov_b64 exec, s[8:9] +; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v2 ; GFX9-NEXT: v_sub_u32_e32 v3, 0, v2 @@ -1005,8 +1029,8 @@ define <2 x i64> @udivrem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc -; GFX9-NEXT: .LBB9_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB9_4: ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: v_mov_b32_e32 v2, v6 diff --git a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll index 1f0e09371d6d5..74c1682d2e2bc 100644 --- a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll +++ b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GCN %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GCN %s diff --git a/llvm/test/CodeGen/AMDGPU/call-skip.ll b/llvm/test/CodeGen/AMDGPU/call-skip.ll index ea2bba1673a0b..8d7d37571789b 100644 --- a/llvm/test/CodeGen/AMDGPU/call-skip.ll +++ b/llvm/test/CodeGen/AMDGPU/call-skip.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -enable-var-scope -check-prefix=GCN %s ; A call should be skipped if all lanes are zero, since we don't know diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll index fdae1696a5a49..e021dfab2ef3d 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll @@ -74,15 +74,17 @@ define void @test_sinkable_flat_small_offset_i32(ptr %out, ptr %in, i32 %cond) { ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX7-NEXT: s_mov_b64 s[4:5], exec +; GFX7-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX7-NEXT: v_mov_b32_e32 v4, 0 -; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7-NEXT: s_cbranch_execz .LBB0_2 +; GFX7-NEXT: s_cmov_b64 exec, vcc +; GFX7-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX7-NEXT: ; %bb.1: ; %if ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 28, v2 ; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX7-NEXT: flat_load_dword v4, v[2:3] -; GFX7-NEXT: .LBB0_2: ; %endif ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: .LBB0_2: ; %endif ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x3d08fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -94,15 +96,17 @@ define void @test_sinkable_flat_small_offset_i32(ptr %out, ptr %in, i32 %cond) { ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB0_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX8-NEXT: ; %bb.1: ; %if ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 28, v2 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v4, v[2:3] -; GFX8-NEXT: .LBB0_2: ; %endif ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: .LBB0_2: ; %endif ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x3d08fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -114,13 +118,15 @@ define void @test_sinkable_flat_small_offset_i32(ptr %out, ptr %in, i32 %cond) { ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB0_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX9-NEXT: ; %bb.1: ; %if ; GFX9-NEXT: flat_load_dword v4, v[2:3] offset:28 -; GFX9-NEXT: .LBB0_2: ; %endif ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB0_2: ; %endif ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x3d0000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -133,12 +139,14 @@ define void @test_sinkable_flat_small_offset_i32(ptr %out, ptr %in, i32 %cond) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB0_2 +; GFX10-NEXT: s_mov_b32 s4, exec_lo +; GFX10-NEXT: s_and_b32 s5, vcc_lo, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX10-NEXT: ; %bb.1: ; %if ; GFX10-NEXT: flat_load_dword v4, v[2:3] offset:28 -; GFX10-NEXT: .LBB0_2: ; %endif ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: .LBB0_2: ; %endif ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x3d0800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -228,18 +236,20 @@ define void @test_sink_noop_addrspacecast_flat_to_global_i32(ptr %out, ptr %in, ; GFX7-LABEL: test_sink_noop_addrspacecast_flat_to_global_i32: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX7-NEXT: s_mov_b64 s[8:9], exec +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_and_b64 s[4:5], vcc, -1 ; GFX7-NEXT: v_mov_b32_e32 v4, 0 -; GFX7-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GFX7-NEXT: s_cbranch_execz .LBB1_2 +; GFX7-NEXT: s_cmov_b64 exec, vcc +; GFX7-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX7-NEXT: ; %bb.1: ; %if ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64 offset:28 -; GFX7-NEXT: .LBB1_2: ; %endif ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: .LBB1_2: ; %endif ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x3d08fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -251,15 +261,17 @@ define void @test_sink_noop_addrspacecast_flat_to_global_i32(ptr %out, ptr %in, ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB1_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX8-NEXT: ; %bb.1: ; %if ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 28, v2 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v4, v[2:3] -; GFX8-NEXT: .LBB1_2: ; %endif ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: .LBB1_2: ; %endif ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x3d08fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -271,13 +283,15 @@ define void @test_sink_noop_addrspacecast_flat_to_global_i32(ptr %out, ptr %in, ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX9-NEXT: ; %bb.1: ; %if ; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:28 -; GFX9-NEXT: .LBB1_2: ; %endif ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB1_2: ; %endif ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x3d0000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -290,12 +304,14 @@ define void @test_sink_noop_addrspacecast_flat_to_global_i32(ptr %out, ptr %in, ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB1_2 +; GFX10-NEXT: s_mov_b32 s4, exec_lo +; GFX10-NEXT: s_and_b32 s5, vcc_lo, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX10-NEXT: ; %bb.1: ; %if ; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:28 -; GFX10-NEXT: .LBB1_2: ; %endif ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: .LBB1_2: ; %endif ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x3d0800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -341,18 +357,20 @@ define void @test_sink_noop_addrspacecast_flat_to_constant_i32(ptr %out, ptr %in ; GFX7-LABEL: test_sink_noop_addrspacecast_flat_to_constant_i32: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX7-NEXT: s_mov_b64 s[8:9], exec +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_and_b64 s[4:5], vcc, -1 ; GFX7-NEXT: v_mov_b32_e32 v4, 0 -; GFX7-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GFX7-NEXT: s_cbranch_execz .LBB2_2 +; GFX7-NEXT: s_cmov_b64 exec, vcc +; GFX7-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX7-NEXT: ; %bb.1: ; %if ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64 offset:28 -; GFX7-NEXT: .LBB2_2: ; %endif ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: .LBB2_2: ; %endif ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x3d08fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -364,15 +382,17 @@ define void @test_sink_noop_addrspacecast_flat_to_constant_i32(ptr %out, ptr %in ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB2_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX8-NEXT: ; %bb.1: ; %if ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 28, v2 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v4, v[2:3] -; GFX8-NEXT: .LBB2_2: ; %endif ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: .LBB2_2: ; %endif ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x3d08fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -384,13 +404,15 @@ define void @test_sink_noop_addrspacecast_flat_to_constant_i32(ptr %out, ptr %in ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB2_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX9-NEXT: ; %bb.1: ; %if ; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:28 -; GFX9-NEXT: .LBB2_2: ; %endif ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB2_2: ; %endif ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x3d0000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -403,12 +425,14 @@ define void @test_sink_noop_addrspacecast_flat_to_constant_i32(ptr %out, ptr %in ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB2_2 +; GFX10-NEXT: s_mov_b32 s4, exec_lo +; GFX10-NEXT: s_and_b32 s5, vcc_lo, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX10-NEXT: ; %bb.1: ; %if ; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:28 -; GFX10-NEXT: .LBB2_2: ; %endif ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: .LBB2_2: ; %endif ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x3d0800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -509,17 +533,19 @@ define void @test_sink_flat_small_max_flat_offset(ptr %out, ptr %in) #1 { ; GFX7-LABEL: test_sink_flat_small_max_flat_offset: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v5, -1, 0 +; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v4, -1, 0 +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX7-NEXT: s_mov_b64 s[4:5], exec +; GFX7-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX7-NEXT: v_mov_b32_e32 v4, 0 -; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7-NEXT: s_cbranch_execz .LBB3_2 +; GFX7-NEXT: s_cmov_b64 exec, vcc +; GFX7-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX7-NEXT: ; %bb.1: ; %if ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xfff, v2 ; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX7-NEXT: flat_load_sbyte v4, v[2:3] -; GFX7-NEXT: .LBB3_2: ; %endif ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: .LBB3_2: ; %endif ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x1000, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -530,17 +556,19 @@ define void @test_sink_flat_small_max_flat_offset(ptr %out, ptr %in) #1 { ; GFX8-LABEL: test_sink_flat_small_max_flat_offset: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v5, -1, 0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v4, -1, 0 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB3_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX8-NEXT: ; %bb.1: ; %if ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xfff, v2 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_sbyte v4, v[2:3] -; GFX8-NEXT: .LBB3_2: ; %endif ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: .LBB3_2: ; %endif ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x1000, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -551,15 +579,17 @@ define void @test_sink_flat_small_max_flat_offset(ptr %out, ptr %in) #1 { ; GFX9-LABEL: test_sink_flat_small_max_flat_offset: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v5, -1, 0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, -1, 0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB3_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX9-NEXT: ; %bb.1: ; %if ; GFX9-NEXT: flat_load_sbyte v4, v[2:3] offset:4095 -; GFX9-NEXT: .LBB3_2: ; %endif ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB3_2: ; %endif ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -571,16 +601,18 @@ define void @test_sink_flat_small_max_flat_offset(ptr %out, ptr %in) #1 { ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v4, -1, 0 +; GFX10-NEXT: s_mov_b32 s4, exec_lo ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB3_2 +; GFX10-NEXT: s_and_b32 s5, vcc_lo, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX10-NEXT: ; %bb.1: ; %if ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0x800, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: flat_load_sbyte v4, v[2:3] offset:2047 -; GFX10-NEXT: .LBB3_2: ; %endif ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: .LBB3_2: ; %endif ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -630,17 +662,19 @@ define void @test_sink_flat_small_max_plus_1_flat_offset(ptr %out, ptr %in) #1 { ; GFX7-LABEL: test_sink_flat_small_max_plus_1_flat_offset: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v5, -1, 0 +; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v4, -1, 0 +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX7-NEXT: s_mov_b64 s[4:5], exec +; GFX7-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX7-NEXT: v_mov_b32_e32 v4, 0 -; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7-NEXT: s_cbranch_execz .LBB4_2 +; GFX7-NEXT: s_cmov_b64 exec, vcc +; GFX7-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX7-NEXT: ; %bb.1: ; %if ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x1000, v2 ; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX7-NEXT: flat_load_sbyte v4, v[2:3] -; GFX7-NEXT: .LBB4_2: ; %endif ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: .LBB4_2: ; %endif ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x61a7c, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -651,17 +685,19 @@ define void @test_sink_flat_small_max_plus_1_flat_offset(ptr %out, ptr %in) #1 { ; GFX8-LABEL: test_sink_flat_small_max_plus_1_flat_offset: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v5, -1, 0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v4, -1, 0 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB4_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX8-NEXT: ; %bb.1: ; %if ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x1000, v2 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_sbyte v4, v[2:3] -; GFX8-NEXT: .LBB4_2: ; %endif ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: .LBB4_2: ; %endif ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x61a7c, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -672,17 +708,19 @@ define void @test_sink_flat_small_max_plus_1_flat_offset(ptr %out, ptr %in) #1 { ; GFX9-LABEL: test_sink_flat_small_max_plus_1_flat_offset: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v5, -1, 0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, -1, 0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB4_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; %if ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0x1000, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: flat_load_sbyte v4, v[2:3] -; GFX9-NEXT: .LBB4_2: ; %endif ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB4_2: ; %endif ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x61000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -694,16 +732,18 @@ define void @test_sink_flat_small_max_plus_1_flat_offset(ptr %out, ptr %in) #1 { ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v4, -1, 0 +; GFX10-NEXT: s_mov_b32 s4, exec_lo ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB4_2 +; GFX10-NEXT: s_and_b32 s5, vcc_lo, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX10-NEXT: ; %bb.1: ; %if ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0x1000, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: flat_load_sbyte v4, v[2:3] -; GFX10-NEXT: .LBB4_2: ; %endif ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: .LBB4_2: ; %endif ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x61800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -753,17 +793,19 @@ define void @test_sinkable_flat_reg_offset(ptr %out, ptr %in, i64 %reg) #1 { ; GFX7-LABEL: test_sinkable_flat_reg_offset: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v7, -1, 0 +; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v6, -1, 0 +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX7-NEXT: s_mov_b64 s[4:5], exec +; GFX7-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX7-NEXT: v_mov_b32_e32 v6, 0 -; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7-NEXT: s_cbranch_execz .LBB5_2 +; GFX7-NEXT: s_cmov_b64 exec, vcc +; GFX7-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX7-NEXT: ; %bb.1: ; %if ; GFX7-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX7-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc ; GFX7-NEXT: flat_load_sbyte v6, v[2:3] -; GFX7-NEXT: .LBB5_2: ; %endif ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: .LBB5_2: ; %endif ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x1000, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -774,17 +816,19 @@ define void @test_sinkable_flat_reg_offset(ptr %out, ptr %in, i64 %reg) #1 { ; GFX8-LABEL: test_sinkable_flat_reg_offset: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v7, -1, 0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v6, -1, 0 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX8-NEXT: v_mov_b32_e32 v6, 0 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB5_2 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX8-NEXT: ; %bb.1: ; %if ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc ; GFX8-NEXT: flat_load_sbyte v6, v[2:3] -; GFX8-NEXT: .LBB5_2: ; %endif ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: .LBB5_2: ; %endif ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x1000, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -795,17 +839,19 @@ define void @test_sinkable_flat_reg_offset(ptr %out, ptr %in, i64 %reg) #1 { ; GFX9-LABEL: test_sinkable_flat_reg_offset: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v7, -1, 0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v6, -1, 0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX9-NEXT: ; %bb.1: ; %if ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc ; GFX9-NEXT: flat_load_sbyte v6, v[2:3] -; GFX9-NEXT: .LBB5_2: ; %endif ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB5_2: ; %endif ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -817,16 +863,18 @@ define void @test_sinkable_flat_reg_offset(ptr %out, ptr %in, i64 %reg) #1 { ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v6, -1, 0 +; GFX10-NEXT: s_mov_b32 s4, exec_lo ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 -; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB5_2 +; GFX10-NEXT: s_and_b32 s5, vcc_lo, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX10-NEXT: ; %bb.1: ; %if ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo ; GFX10-NEXT: flat_load_sbyte v6, v[2:3] -; GFX10-NEXT: .LBB5_2: ; %endif ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: .LBB5_2: ; %endif ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll index 1588dde19cfb7..6492582551216 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll @@ -1,5 +1,5 @@ +; XFAIL: * ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: opt -S -passes='require,function(codegenprepare)' -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefix=OPT %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefix=GCN %s diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll index da609bfa8edea..b405c0b3c9966 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll @@ -28,9 +28,11 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_fadd_f32(ptr add ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b64 s[6:7], exec +; GCN-NEXT: s_and_b64 s[4:5], vcc, -1 ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz .LBB0_2 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB0_2 ; GCN-NEXT: ; %bb.1: ; %if ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 2.0 @@ -38,8 +40,8 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_fadd_f32(ptr add ; GCN-NEXT: global_atomic_add_f32 v0, v1, s[2:3] offset:28 ; GCN-NEXT: global_load_dword v0, v[0:1], off glc ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-NEXT: .LBB0_2: ; %endif -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, 0x3d0000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_store_dword v1, v0, s[0:1] offset:2300 diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll index 49f9f695409b1..f5e601123ddd0 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; RUN: opt -S -passes='require,function(codegenprepare)' -mtriple=amdgcn-unknown-unknown -mcpu=tahiti < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-SI -check-prefix=OPT-SICIVI %s ; RUN: opt -S -passes='require,function(codegenprepare)' -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI -check-prefix=OPT-SICIVI %s ; RUN: opt -S -passes='require,function(codegenprepare)' -mtriple=amdgcn-unknown-unknown -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI -check-prefix=OPT-SICIVI %s diff --git a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll index bcdfb75ab1ef9..07acf07b89262 100644 --- a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll +++ b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll @@ -195,27 +195,30 @@ define void @recursive_phis(i1 %cond, ptr addrspace(5) %ptr) { ; DAGISEL-ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; DAGISEL-ASM-NEXT: v_and_b32_e32 v0, 1, v0 ; DAGISEL-ASM-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; DAGISEL-ASM-NEXT: s_xor_b64 s[4:5], vcc, -1 +; DAGISEL-ASM-NEXT: s_and_b64 s[6:7], vcc, -1 +; DAGISEL-ASM-NEXT: s_mov_b64 s[8:9], exec ; DAGISEL-ASM-NEXT: v_lshrrev_b32_e64 v0, 6, s32 -; DAGISEL-ASM-NEXT: s_and_saveexec_b64 s[4:5], vcc +; DAGISEL-ASM-NEXT: s_mov_b64 s[6:7], 0 +; DAGISEL-ASM-NEXT: s_cmov_b64 exec, vcc +; DAGISEL-ASM-NEXT: s_cbranch_scc0 .LBB7_2 ; DAGISEL-ASM-NEXT: ; %bb.1: ; %then ; DAGISEL-ASM-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; DAGISEL-ASM-NEXT: ; %bb.2: ; %finallyendcf.split -; DAGISEL-ASM-NEXT: s_or_b64 exec, exec, s[4:5] -; DAGISEL-ASM-NEXT: s_xor_b64 s[6:7], vcc, -1 -; DAGISEL-ASM-NEXT: s_mov_b64 s[4:5], 0 -; DAGISEL-ASM-NEXT: s_mov_b64 s[8:9], src_private_base -; DAGISEL-ASM-NEXT: v_mov_b32_e32 v2, 7 -; DAGISEL-ASM-NEXT: .LBB7_3: ; %finally +; DAGISEL-ASM-NEXT: s_or_b64 exec, exec, s[8:9] +; DAGISEL-ASM-NEXT: .LBB7_2: ; %finally ; DAGISEL-ASM-NEXT: ; =>This Inner Loop Header: Depth=1 -; DAGISEL-ASM-NEXT: s_and_b64 s[10:11], exec, s[6:7] -; DAGISEL-ASM-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] +; DAGISEL-ASM-NEXT: s_and_b64 s[8:9], exec, s[4:5] +; DAGISEL-ASM-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] +; DAGISEL-ASM-NEXT: s_mov_b64 s[8:9], src_private_base ; DAGISEL-ASM-NEXT: v_mov_b32_e32 v1, s9 +; DAGISEL-ASM-NEXT: s_andn2_b64 s[8:9], exec, s[6:7] +; DAGISEL-ASM-NEXT: v_mov_b32_e32 v2, 7 +; DAGISEL-ASM-NEXT: s_and_b64 s[10:11], s[8:9], -1 ; DAGISEL-ASM-NEXT: flat_store_dword v[0:1], v2 ; DAGISEL-ASM-NEXT: s_waitcnt vmcnt(0) -; DAGISEL-ASM-NEXT: s_andn2_b64 exec, exec, s[4:5] -; DAGISEL-ASM-NEXT: s_cbranch_execnz .LBB7_3 -; DAGISEL-ASM-NEXT: ; %bb.4: ; %end -; DAGISEL-ASM-NEXT: s_or_b64 exec, exec, s[4:5] +; DAGISEL-ASM-NEXT: s_cselect_b64 exec, s[8:9], s[6:7] +; DAGISEL-ASM-NEXT: s_cbranch_scc1 .LBB7_2 +; DAGISEL-ASM-NEXT: ; %bb.3: ; %end ; DAGISEL-ASM-NEXT: s_waitcnt lgkmcnt(0) ; DAGISEL-ASM-NEXT: s_setpc_b64 s[30:31] ; @@ -225,26 +228,29 @@ define void @recursive_phis(i1 %cond, ptr addrspace(5) %ptr) { ; GISEL-ASM-NEXT: v_and_b32_e32 v0, 1, v0 ; GISEL-ASM-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GISEL-ASM-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GISEL-ASM-NEXT: s_mov_b64 s[8:9], exec +; GISEL-ASM-NEXT: s_mov_b64 s[6:7], 0 +; GISEL-ASM-NEXT: s_and_b64 s[10:11], vcc, -1 ; GISEL-ASM-NEXT: v_lshrrev_b32_e64 v0, 6, s32 -; GISEL-ASM-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GISEL-ASM-NEXT: s_cmov_b64 exec, vcc +; GISEL-ASM-NEXT: s_cbranch_scc0 .LBB7_2 ; GISEL-ASM-NEXT: ; %bb.1: ; %then ; GISEL-ASM-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GISEL-ASM-NEXT: ; %bb.2: ; %finallyendcf.split -; GISEL-ASM-NEXT: s_or_b64 exec, exec, s[6:7] -; GISEL-ASM-NEXT: s_mov_b64 s[8:9], src_private_base -; GISEL-ASM-NEXT: s_mov_b64 s[6:7], 0 -; GISEL-ASM-NEXT: v_mov_b32_e32 v1, s9 -; GISEL-ASM-NEXT: v_mov_b32_e32 v2, 7 -; GISEL-ASM-NEXT: .LBB7_3: ; %finally +; GISEL-ASM-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-ASM-NEXT: .LBB7_2: ; %finally ; GISEL-ASM-NEXT: ; =>This Inner Loop Header: Depth=1 ; GISEL-ASM-NEXT: s_and_b64 s[8:9], exec, s[4:5] ; GISEL-ASM-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] +; GISEL-ASM-NEXT: s_mov_b64 s[8:9], src_private_base +; GISEL-ASM-NEXT: v_mov_b32_e32 v1, s9 +; GISEL-ASM-NEXT: s_andn2_b64 s[8:9], exec, s[6:7] +; GISEL-ASM-NEXT: v_mov_b32_e32 v2, 7 +; GISEL-ASM-NEXT: s_and_b64 s[10:11], s[8:9], -1 ; GISEL-ASM-NEXT: flat_store_dword v[0:1], v2 ; GISEL-ASM-NEXT: s_waitcnt vmcnt(0) -; GISEL-ASM-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GISEL-ASM-NEXT: s_cbranch_execnz .LBB7_3 -; GISEL-ASM-NEXT: ; %bb.4: ; %end -; GISEL-ASM-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-ASM-NEXT: s_cselect_b64 exec, s[8:9], s[6:7] +; GISEL-ASM-NEXT: s_cbranch_scc1 .LBB7_2 +; GISEL-ASM-NEXT: ; %bb.3: ; %end ; GISEL-ASM-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-ASM-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll index 6bc8d29b3bf7c..b1ee146b449a7 100644 --- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll +++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll @@ -10,19 +10,23 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: simple_nested_if: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 1, v0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz .LBB0_3 +; GCN-NEXT: s_mov_b64 s[4:5], exec +; GCN-NEXT: s_and_b64 s[2:3], vcc, -1 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB0_4 ; GCN-NEXT: ; %bb.1: ; %bb.outer.then ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 2, v0 +; GCN-NEXT: s_mov_b64 s[6:7], exec ; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 2, v0 +; GCN-NEXT: s_and_b64 s[8:9], vcc, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dword v2, v[1:2], s[0:3], 0 addr64 -; GCN-NEXT: s_and_b64 exec, exec, vcc -; GCN-NEXT: s_cbranch_execz .LBB0_3 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB0_3 ; GCN-NEXT: ; %bb.2: ; %bb.inner.then ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s1 @@ -32,8 +36,10 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) { ; GCN-NEXT: s_mov_b32 s1, s2 ; GCN-NEXT: v_mov_b32_e32 v2, 1 ; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 offset:4 -; GCN-NEXT: .LBB0_3: ; %bb.outer.end +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: .LBB0_3: ; %Flow ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: .LBB0_4: ; %bb.outer.end ; GCN-NEXT: v_mov_b32_e32 v0, 3 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_mov_b32 m0, -1 @@ -60,17 +66,18 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: v_mov_b32_e32 v2, v1 ; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b32 s0, 1 -; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v1, s0 -; GCN-O0-NEXT: s_mov_b64 s[0:1], exec -; GCN-O0-NEXT: v_writelane_b32 v0, s0, 2 -; GCN-O0-NEXT: v_writelane_b32 v0, s1, 3 +; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[0:1], v1, s0 +; GCN-O0-NEXT: s_mov_b64 s[2:3], exec +; GCN-O0-NEXT: v_writelane_b32 v0, s2, 2 +; GCN-O0-NEXT: v_writelane_b32 v0, s3, 3 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] -; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] -; GCN-O0-NEXT: s_cbranch_execz .LBB0_4 -; GCN-O0-NEXT: ; %bb.1: ; %bb.outer.then +; GCN-O0-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GCN-O0-NEXT: s_cmov_b64 exec, s[0:1] +; GCN-O0-NEXT: s_cbranch_scc1 .LBB0_1 +; GCN-O0-NEXT: s_branch .LBB0_4 +; GCN-O0-NEXT: .LBB0_1: ; %bb.outer.then ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload @@ -93,24 +100,27 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: v_lshl_b64 v[3:4], v[2:3], s0 ; GCN-O0-NEXT: v_mov_b32_e32 v2, 0 ; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[4:7], 0 addr64 -; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[2:3], v1, s0 -; GCN-O0-NEXT: s_mov_b64 s[0:1], exec -; GCN-O0-NEXT: v_writelane_b32 v0, s0, 4 -; GCN-O0-NEXT: v_writelane_b32 v0, s1, 5 +; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[0:1], v1, s0 +; GCN-O0-NEXT: s_mov_b64 s[2:3], exec +; GCN-O0-NEXT: v_writelane_b32 v0, s2, 4 +; GCN-O0-NEXT: v_writelane_b32 v0, s3, 5 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] -; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] -; GCN-O0-NEXT: s_cbranch_execz .LBB0_3 -; GCN-O0-NEXT: ; %bb.2: ; %bb.inner.then +; GCN-O0-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GCN-O0-NEXT: s_cmov_b64 exec, s[0:1] +; GCN-O0-NEXT: s_cbranch_scc1 .LBB0_2 +; GCN-O0-NEXT: s_branch .LBB0_3 +; GCN-O0-NEXT: .LBB0_2: ; %bb.inner.then ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1 +; GCN-O0-NEXT: v_readlane_b32 s0, v0, 4 +; GCN-O0-NEXT: v_readlane_b32 s1, v0, 5 +; GCN-O0-NEXT: v_readlane_b32 s4, v0, 0 +; GCN-O0-NEXT: v_readlane_b32 s5, v0, 1 ; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: v_mov_b32_e32 v0, 1 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) @@ -120,31 +130,28 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: v_mov_b32_e32 v2, v3 ; GCN-O0-NEXT: s_mov_b32 s2, 2 ; GCN-O0-NEXT: v_lshl_b64 v[1:2], v[1:2], s2 -; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 -; GCN-O0-NEXT: s_mov_b32 s4, 0 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 -; GCN-O0-NEXT: s_mov_b32 s5, s2 -; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] -; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-O0-NEXT: s_mov_b32 s6, 0xf000 +; GCN-O0-NEXT: s_mov_b32 s2, 0 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b64 s[6:7], s[2:3] +; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 +; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: .LBB0_3: ; %Flow ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 4 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 5 +; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2 +; GCN-O0-NEXT: v_readlane_b32 s1, v0, 3 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: .LBB0_4: ; %bb.outer.end ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 3 -; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: v_mov_b32_e32 v2, 3 ; GCN-O0-NEXT: v_mov_b32_e32 v1, 0 ; GCN-O0-NEXT: s_mov_b32 m0, -1 @@ -177,36 +184,40 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a ; GCN-LABEL: uncollapsable_nested_if: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 1, v0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz .LBB1_4 +; GCN-NEXT: s_mov_b64 s[4:5], exec +; GCN-NEXT: s_and_b64 s[2:3], vcc, -1 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB1_4 ; GCN-NEXT: ; %bb.1: ; %bb.outer.then ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; GCN-NEXT: s_mov_b64 s[6:7], exec ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s1 ; GCN-NEXT: v_add_i32_e32 v1, vcc, s0, v3 ; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 2, v0 +; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: s_and_b64 s[8:9], vcc, -1 ; GCN-NEXT: buffer_store_dword v4, v[3:4], s[0:3], 0 addr64 -; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GCN-NEXT: s_cbranch_execz .LBB1_3 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB1_3 ; GCN-NEXT: ; %bb.2: ; %bb.inner.then ; GCN-NEXT: s_mov_b32 s0, s2 ; GCN-NEXT: s_mov_b32 s1, s2 ; GCN-NEXT: v_mov_b32_e32 v0, 1 ; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:4 -; GCN-NEXT: .LBB1_3: ; %bb.inner.end ; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: .LBB1_3: ; %bb.inner.end ; GCN-NEXT: s_mov_b32 s0, s2 ; GCN-NEXT: s_mov_b32 s1, s2 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, 2 ; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:8 -; GCN-NEXT: .LBB1_4: ; %Flow ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: .LBB1_4: ; %bb.outer.end ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, 3 ; GCN-NEXT: v_mov_b32_e32 v1, 0 @@ -234,17 +245,18 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a ; GCN-O0-NEXT: v_mov_b32_e32 v2, v1 ; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b32 s0, 1 -; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v1, s0 -; GCN-O0-NEXT: s_mov_b64 s[0:1], exec -; GCN-O0-NEXT: v_writelane_b32 v0, s0, 2 -; GCN-O0-NEXT: v_writelane_b32 v0, s1, 3 +; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[0:1], v1, s0 +; GCN-O0-NEXT: s_mov_b64 s[2:3], exec +; GCN-O0-NEXT: v_writelane_b32 v0, s2, 2 +; GCN-O0-NEXT: v_writelane_b32 v0, s3, 3 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] -; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] -; GCN-O0-NEXT: s_cbranch_execz .LBB1_3 -; GCN-O0-NEXT: ; %bb.1: ; %bb.outer.then +; GCN-O0-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GCN-O0-NEXT: s_cmov_b64 exec, s[0:1] +; GCN-O0-NEXT: s_cbranch_scc1 .LBB1_1 +; GCN-O0-NEXT: s_branch .LBB1_3 +; GCN-O0-NEXT: .LBB1_1: ; %bb.outer.then ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload @@ -267,24 +279,27 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a ; GCN-O0-NEXT: v_lshl_b64 v[3:4], v[2:3], s0 ; GCN-O0-NEXT: v_mov_b32_e32 v2, 0 ; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[4:7], 0 addr64 -; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[2:3], v1, s0 -; GCN-O0-NEXT: s_mov_b64 s[0:1], exec -; GCN-O0-NEXT: v_writelane_b32 v0, s0, 4 -; GCN-O0-NEXT: v_writelane_b32 v0, s1, 5 +; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[0:1], v1, s0 +; GCN-O0-NEXT: s_mov_b64 s[2:3], exec +; GCN-O0-NEXT: v_writelane_b32 v0, s2, 4 +; GCN-O0-NEXT: v_writelane_b32 v0, s3, 5 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] -; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] -; GCN-O0-NEXT: s_cbranch_execz .LBB1_4 -; GCN-O0-NEXT: ; %bb.2: ; %bb.inner.then +; GCN-O0-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GCN-O0-NEXT: s_cmov_b64 exec, s[0:1] +; GCN-O0-NEXT: s_cbranch_scc1 .LBB1_2 +; GCN-O0-NEXT: s_branch .LBB1_4 +; GCN-O0-NEXT: .LBB1_2: ; %bb.inner.then ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1 +; GCN-O0-NEXT: v_readlane_b32 s0, v0, 4 +; GCN-O0-NEXT: v_readlane_b32 s1, v0, 5 +; GCN-O0-NEXT: v_readlane_b32 s4, v0, 0 +; GCN-O0-NEXT: v_readlane_b32 s5, v0, 1 ; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: v_mov_b32_e32 v0, 1 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) @@ -294,23 +309,16 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a ; GCN-O0-NEXT: v_mov_b32_e32 v2, v3 ; GCN-O0-NEXT: s_mov_b32 s2, 2 ; GCN-O0-NEXT: v_lshl_b64 v[1:2], v[1:2], s2 -; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 -; GCN-O0-NEXT: s_mov_b32 s4, 0 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 -; GCN-O0-NEXT: s_mov_b32 s5, s2 -; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] -; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-O0-NEXT: s_mov_b32 s6, 0xf000 +; GCN-O0-NEXT: s_mov_b32 s2, 0 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b64 s[6:7], s[2:3] +; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 +; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: s_branch .LBB1_4 ; GCN-O0-NEXT: .LBB1_3: ; %Flow -; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 3 -; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: s_branch .LBB1_5 ; GCN-O0-NEXT: .LBB1_4: ; %bb.inner.end ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 @@ -318,11 +326,10 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s2, v0, 4 -; GCN-O0-NEXT: v_readlane_b32 s3, v0, 5 -; GCN-O0-NEXT: s_or_b64 exec, exec, s[2:3] -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1 +; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2 +; GCN-O0-NEXT: v_readlane_b32 s1, v0, 3 +; GCN-O0-NEXT: v_readlane_b32 s4, v0, 0 +; GCN-O0-NEXT: v_readlane_b32 s5, v0, 1 ; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: v_mov_b32_e32 v0, 2 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) @@ -331,16 +338,18 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a ; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GCN-O0-NEXT: v_mov_b32_e32 v2, v3 ; GCN-O0-NEXT: v_lshl_b64 v[1:2], v[1:2], v0 -; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 -; GCN-O0-NEXT: s_mov_b32 s4, 0 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 -; GCN-O0-NEXT: s_mov_b32 s5, s2 -; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] -; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-O0-NEXT: s_mov_b32 s6, 0xf000 +; GCN-O0-NEXT: s_mov_b32 s2, 0 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b64 s[6:7], s[2:3] +; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 +; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: s_branch .LBB1_3 ; GCN-O0-NEXT: .LBB1_5: ; %bb.outer.end ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: v_mov_b32_e32 v2, 3 @@ -381,45 +390,52 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: nested_if_if_else: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 1, v0 +; GCN-NEXT: s_mov_b64 s[4:5], exec ; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 -; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 1, v0 +; GCN-NEXT: s_and_b64 s[6:7], vcc, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dword v2, v[1:2], s[0:3], 0 addr64 -; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GCN-NEXT: s_cbranch_execz .LBB2_5 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB2_6 ; GCN-NEXT: ; %bb.1: ; %bb.outer.then -; GCN-NEXT: v_mov_b32_e32 v4, s1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s0, v1 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 2, v0 -; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GCN-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GCN-NEXT: s_cbranch_execz .LBB2_3 +; GCN-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NEXT: s_xor_b64 s[2:3], vcc, exec +; GCN-NEXT: v_add_i32_e64 v0, s[0:1], s0, v1 +; GCN-NEXT: s_and_b64 s[6:7], vcc, -1 +; GCN-NEXT: v_addc_u32_e64 v1, s[0:1], 0, v3, s[0:1] +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB2_3 ; GCN-NEXT: ; %bb.2: ; %bb.else -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: v_mov_b32_e32 v0, 2 -; GCN-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64 offset:8 -; GCN-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GCN-NEXT: s_mov_b32 s10, 0 +; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_mov_b32 s8, s10 +; GCN-NEXT: s_mov_b32 s9, s10 +; GCN-NEXT: v_mov_b32_e32 v3, 2 +; GCN-NEXT: buffer_store_dword v3, v[0:1], s[8:11], 0 addr64 offset:8 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN-NEXT: s_or_b64 exec, exec, s[2:3] ; GCN-NEXT: .LBB2_3: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GCN-NEXT: s_cbranch_execz .LBB2_5 +; GCN-NEXT: s_xor_b64 s[0:1], s[2:3], exec +; GCN-NEXT: s_and_b64 s[6:7], s[2:3], -1 +; GCN-NEXT: s_cmov_b64 exec, s[2:3] +; GCN-NEXT: s_cbranch_scc0 .LBB2_5 ; GCN-NEXT: ; %bb.4: ; %bb.then -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, 1 -; GCN-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64 offset:4 -; GCN-NEXT: .LBB2_5: ; %bb.outer.end -; GCN-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN-NEXT: s_mov_b32 s10, 0 +; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_mov_b32 s8, s10 +; GCN-NEXT: s_mov_b32 s9, s10 ; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v3, 1 +; GCN-NEXT: buffer_store_dword v3, v[0:1], s[8:11], 0 addr64 offset:4 +; GCN-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN-NEXT: .LBB2_5: ; %Flow7 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: .LBB2_6: ; %bb.outer.end ; GCN-NEXT: v_mov_b32_e32 v0, 3 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: ds_write_b32 v2, v0 @@ -435,9 +451,9 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: s_addc_u32 s13, s13, 0 ; GCN-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane ; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 -; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) ; GCN-O0-NEXT: s_mov_b64 s[2:3], s[0:1] @@ -463,61 +479,63 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: v_mov_b32_e32 v2, 0 ; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[0:3], 0 addr64 ; GCN-O0-NEXT: s_mov_b32 s0, 1 -; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v1, s0 -; GCN-O0-NEXT: s_mov_b64 s[0:1], exec -; GCN-O0-NEXT: v_writelane_b32 v0, s0, 2 -; GCN-O0-NEXT: v_writelane_b32 v0, s1, 3 -; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[0:1], v1, s0 +; GCN-O0-NEXT: s_mov_b64 s[2:3], exec +; GCN-O0-NEXT: v_writelane_b32 v0, s2, 2 +; GCN-O0-NEXT: v_writelane_b32 v0, s3, 3 +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] -; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] -; GCN-O0-NEXT: s_cbranch_execz .LBB2_6 -; GCN-O0-NEXT: ; %bb.1: ; %bb.outer.then -; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] +; GCN-O0-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GCN-O0-NEXT: s_cmov_b64 exec, s[0:1] +; GCN-O0-NEXT: s_cbranch_scc1 .LBB2_1 +; GCN-O0-NEXT: s_branch .LBB2_6 +; GCN-O0-NEXT: .LBB2_1: ; %bb.outer.then +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b32 s0, 2 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[0:1], v1, s0 -; GCN-O0-NEXT: s_mov_b64 s[2:3], exec -; GCN-O0-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] -; GCN-O0-NEXT: s_xor_b64 s[2:3], s[0:1], s[2:3] +; GCN-O0-NEXT: s_xor_b64 s[2:3], s[0:1], exec ; GCN-O0-NEXT: v_writelane_b32 v0, s2, 4 ; GCN-O0-NEXT: v_writelane_b32 v0, s3, 5 -; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] -; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] -; GCN-O0-NEXT: s_cbranch_execz .LBB2_2 -; GCN-O0-NEXT: s_branch .LBB2_4 +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] +; GCN-O0-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GCN-O0-NEXT: s_cmov_b64 exec, s[0:1] +; GCN-O0-NEXT: s_cbranch_scc1 .LBB2_4 ; GCN-O0-NEXT: .LBB2_2: ; %Flow -; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_readlane_b32 s0, v0, 4 ; GCN-O0-NEXT: v_readlane_b32 s1, v0, 5 -; GCN-O0-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] -; GCN-O0-NEXT: s_and_b64 s[0:1], exec, s[0:1] -; GCN-O0-NEXT: v_writelane_b32 v0, s0, 6 -; GCN-O0-NEXT: v_writelane_b32 v0, s1, 7 -; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-O0-NEXT: s_xor_b64 s[2:3], s[0:1], exec +; GCN-O0-NEXT: v_writelane_b32 v0, s2, 6 +; GCN-O0-NEXT: v_writelane_b32 v0, s3, 7 +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] -; GCN-O0-NEXT: s_xor_b64 exec, exec, s[0:1] -; GCN-O0-NEXT: s_cbranch_execz .LBB2_5 -; GCN-O0-NEXT: ; %bb.3: ; %bb.then -; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] +; GCN-O0-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GCN-O0-NEXT: s_cmov_b64 exec, s[0:1] +; GCN-O0-NEXT: s_cbranch_scc1 .LBB2_3 +; GCN-O0-NEXT: s_branch .LBB2_5 +; GCN-O0-NEXT: .LBB2_3: ; %bb.then +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1 +; GCN-O0-NEXT: v_readlane_b32 s0, v0, 6 +; GCN-O0-NEXT: v_readlane_b32 s1, v0, 7 +; GCN-O0-NEXT: v_readlane_b32 s4, v0, 0 +; GCN-O0-NEXT: v_readlane_b32 s5, v0, 1 ; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: v_mov_b32_e32 v0, 1 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) @@ -527,22 +545,25 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: v_mov_b32_e32 v2, v3 ; GCN-O0-NEXT: s_mov_b32 s2, 2 ; GCN-O0-NEXT: v_lshl_b64 v[1:2], v[1:2], s2 -; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 -; GCN-O0-NEXT: s_mov_b32 s4, 0 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 -; GCN-O0-NEXT: s_mov_b32 s5, s2 -; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] -; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-O0-NEXT: s_mov_b32 s6, 0xf000 +; GCN-O0-NEXT: s_mov_b32 s2, 0 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b64 s[6:7], s[2:3] +; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 +; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: s_branch .LBB2_5 ; GCN-O0-NEXT: .LBB2_4: ; %bb.else -; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1 +; GCN-O0-NEXT: v_readlane_b32 s0, v0, 4 +; GCN-O0-NEXT: v_readlane_b32 s1, v0, 5 +; GCN-O0-NEXT: v_readlane_b32 s4, v0, 0 +; GCN-O0-NEXT: v_readlane_b32 s5, v0, 1 ; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: v_mov_b32_e32 v0, 2 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) @@ -551,32 +572,29 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GCN-O0-NEXT: v_mov_b32_e32 v2, v3 ; GCN-O0-NEXT: v_lshl_b64 v[1:2], v[1:2], v0 -; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 -; GCN-O0-NEXT: s_mov_b32 s4, 0 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 -; GCN-O0-NEXT: s_mov_b32 s5, s2 -; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] -; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-O0-NEXT: s_mov_b32 s6, 0xf000 +; GCN-O0-NEXT: s_mov_b32 s2, 0 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b64 s[6:7], s[2:3] +; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 +; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: s_branch .LBB2_2 ; GCN-O0-NEXT: .LBB2_5: ; %Flow1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 6 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 7 +; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2 +; GCN-O0-NEXT: v_readlane_b32 s1, v0, 3 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: .LBB2_6: ; %bb.outer.end -; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 3 -; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: v_mov_b32_e32 v2, 3 ; GCN-O0-NEXT: v_mov_b32_e32 v1, 0 ; GCN-O0-NEXT: s_mov_b32 m0, -1 @@ -624,48 +642,54 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-NEXT: v_add_i32_e32 v1, vcc, s0, v3 ; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 2, v0 +; GCN-NEXT: s_xor_b64 s[4:5], vcc, exec +; GCN-NEXT: s_and_b64 s[6:7], vcc, -1 ; GCN-NEXT: buffer_store_dword v4, v[3:4], s[0:3], 0 addr64 -; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[0:1] -; GCN-NEXT: s_cbranch_execz .LBB3_4 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB3_4 ; GCN-NEXT: ; %bb.1: ; %bb.outer.else +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b64 s[6:7], exec ; GCN-NEXT: s_mov_b32 s0, s2 ; GCN-NEXT: s_mov_b32 s1, s2 ; GCN-NEXT: v_mov_b32_e32 v3, 3 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_and_b64 s[8:9], vcc, -1 ; GCN-NEXT: buffer_store_dword v3, v[1:2], s[0:3], 0 addr64 offset:12 -; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GCN-NEXT: s_cbranch_execz .LBB3_3 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB3_3 ; GCN-NEXT: ; %bb.2: ; %bb.inner.then2 -; GCN-NEXT: s_mov_b32 s10, 0 -; GCN-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NEXT: s_mov_b32 s8, s10 -; GCN-NEXT: s_mov_b32 s9, s10 +; GCN-NEXT: s_mov_b32 s0, s2 +; GCN-NEXT: s_mov_b32 s1, s2 ; GCN-NEXT: v_mov_b32_e32 v0, 4 -; GCN-NEXT: buffer_store_dword v0, v[1:2], s[8:11], 0 addr64 offset:16 +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:16 +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-NEXT: .LBB3_3: ; %Flow -; GCN-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: .LBB3_4: ; %Flow2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_8 +; GCN-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GCN-NEXT: s_and_b64 s[0:1], s[4:5], -1 +; GCN-NEXT: s_cmov_b64 exec, s[4:5] +; GCN-NEXT: s_cbranch_scc0 .LBB3_8 ; GCN-NEXT: ; %bb.5: ; %bb.outer.then +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 +; GCN-NEXT: s_mov_b64 s[4:5], exec ; GCN-NEXT: s_mov_b32 s0, s2 ; GCN-NEXT: s_mov_b32 s1, s2 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v3, 1 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 +; GCN-NEXT: s_and_b64 s[8:9], vcc, -1 ; GCN-NEXT: buffer_store_dword v3, v[1:2], s[0:3], 0 addr64 offset:4 -; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GCN-NEXT: s_cbranch_execz .LBB3_7 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB3_7 ; GCN-NEXT: ; %bb.6: ; %bb.inner.then ; GCN-NEXT: v_mov_b32_e32 v0, 2 ; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:8 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: .LBB3_7: ; %Flow1 ; GCN-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-NEXT: .LBB3_8: ; %bb.outer.end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, 3 ; GCN-NEXT: v_mov_b32_e32 v1, 0 @@ -719,17 +743,15 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: v_mov_b32_e32 v2, 0 ; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[4:7], 0 addr64 ; GCN-O0-NEXT: v_cmp_lt_u32_e64 s[0:1], v1, s0 -; GCN-O0-NEXT: s_mov_b64 s[2:3], exec -; GCN-O0-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] -; GCN-O0-NEXT: s_xor_b64 s[2:3], s[0:1], s[2:3] +; GCN-O0-NEXT: s_xor_b64 s[2:3], s[0:1], exec ; GCN-O0-NEXT: v_writelane_b32 v0, s2, 0 ; GCN-O0-NEXT: v_writelane_b32 v0, s3, 1 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] -; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] -; GCN-O0-NEXT: s_cbranch_execz .LBB3_1 -; GCN-O0-NEXT: s_branch .LBB3_4 +; GCN-O0-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GCN-O0-NEXT: s_cmov_b64 exec, s[0:1] +; GCN-O0-NEXT: s_cbranch_scc1 .LBB3_4 ; GCN-O0-NEXT: .LBB3_1: ; %Flow2 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) @@ -738,16 +760,17 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0 ; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] -; GCN-O0-NEXT: s_and_b64 s[0:1], exec, s[0:1] -; GCN-O0-NEXT: v_writelane_b32 v0, s0, 2 -; GCN-O0-NEXT: v_writelane_b32 v0, s1, 3 +; GCN-O0-NEXT: s_xor_b64 s[2:3], s[0:1], exec +; GCN-O0-NEXT: v_writelane_b32 v0, s2, 2 +; GCN-O0-NEXT: v_writelane_b32 v0, s3, 3 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] -; GCN-O0-NEXT: s_xor_b64 exec, exec, s[0:1] -; GCN-O0-NEXT: s_cbranch_execz .LBB3_8 -; GCN-O0-NEXT: ; %bb.2: ; %bb.outer.then +; GCN-O0-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GCN-O0-NEXT: s_cmov_b64 exec, s[0:1] +; GCN-O0-NEXT: s_cbranch_scc1 .LBB3_2 +; GCN-O0-NEXT: s_branch .LBB3_8 +; GCN-O0-NEXT: .LBB3_2: ; %bb.outer.then ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload @@ -767,32 +790,39 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[0:3], 0 addr64 offset:4 ; GCN-O0-NEXT: s_mov_b32 s0, 2 -; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v1, s0 -; GCN-O0-NEXT: s_mov_b64 s[0:1], exec -; GCN-O0-NEXT: v_writelane_b32 v0, s0, 4 -; GCN-O0-NEXT: v_writelane_b32 v0, s1, 5 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, s0 +; GCN-O0-NEXT: s_mov_b64 s[2:3], exec +; GCN-O0-NEXT: v_writelane_b32 v0, s2, 4 +; GCN-O0-NEXT: v_writelane_b32 v0, s3, 5 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] -; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] -; GCN-O0-NEXT: s_cbranch_execz .LBB3_7 -; GCN-O0-NEXT: ; %bb.3: ; %bb.inner.then -; GCN-O0-NEXT: s_waitcnt expcnt(1) +; GCN-O0-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GCN-O0-NEXT: s_cmov_b64 exec, s[0:1] +; GCN-O0-NEXT: s_cbranch_scc1 .LBB3_3 +; GCN-O0-NEXT: s_branch .LBB3_7 +; GCN-O0-NEXT: .LBB3_3: ; %bb.inner.then +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s0, v0, 4 +; GCN-O0-NEXT: v_readlane_b32 s1, v0, 5 ; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b32 s0, 0xf000 -; GCN-O0-NEXT: s_mov_b32 s2, 0 -; GCN-O0-NEXT: s_mov_b32 s4, s2 -; GCN-O0-NEXT: s_mov_b32 s5, s0 -; GCN-O0-NEXT: s_mov_b32 s0, s2 -; GCN-O0-NEXT: s_mov_b32 s1, s2 -; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] -; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: s_mov_b32 s4, 0xf000 +; GCN-O0-NEXT: s_mov_b32 s6, 0 +; GCN-O0-NEXT: s_mov_b32 s2, s6 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: s_mov_b32 s4, s6 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b64 s[6:7], s[2:3] ; GCN-O0-NEXT: v_mov_b32_e32 v0, 2 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:8 +; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 offset:8 +; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: s_branch .LBB3_7 ; GCN-O0-NEXT: .LBB3_4: ; %bb.outer.else ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 @@ -813,40 +843,47 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: v_mov_b32_e32 v2, 3 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[4:7], 0 addr64 offset:12 -; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v1, s0 -; GCN-O0-NEXT: s_mov_b64 s[0:1], exec -; GCN-O0-NEXT: v_writelane_b32 v0, s0, 6 -; GCN-O0-NEXT: v_writelane_b32 v0, s1, 7 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, s0 +; GCN-O0-NEXT: s_mov_b64 s[2:3], exec +; GCN-O0-NEXT: v_writelane_b32 v0, s2, 6 +; GCN-O0-NEXT: v_writelane_b32 v0, s3, 7 ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] -; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] -; GCN-O0-NEXT: s_cbranch_execz .LBB3_6 -; GCN-O0-NEXT: ; %bb.5: ; %bb.inner.then2 -; GCN-O0-NEXT: s_waitcnt expcnt(1) +; GCN-O0-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GCN-O0-NEXT: s_cmov_b64 exec, s[0:1] +; GCN-O0-NEXT: s_cbranch_scc1 .LBB3_5 +; GCN-O0-NEXT: s_branch .LBB3_6 +; GCN-O0-NEXT: .LBB3_5: ; %bb.inner.then2 +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s0, v0, 6 +; GCN-O0-NEXT: v_readlane_b32 s1, v0, 7 ; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b32 s0, 0xf000 -; GCN-O0-NEXT: s_mov_b32 s2, 0 -; GCN-O0-NEXT: s_mov_b32 s4, s2 -; GCN-O0-NEXT: s_mov_b32 s5, s0 -; GCN-O0-NEXT: s_mov_b32 s0, s2 -; GCN-O0-NEXT: s_mov_b32 s1, s2 -; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] -; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: s_mov_b32 s4, 0xf000 +; GCN-O0-NEXT: s_mov_b32 s6, 0 +; GCN-O0-NEXT: s_mov_b32 s2, s6 +; GCN-O0-NEXT: s_mov_b32 s3, s4 +; GCN-O0-NEXT: s_mov_b32 s4, s6 +; GCN-O0-NEXT: s_mov_b32 s5, s6 +; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b64 s[6:7], s[2:3] ; GCN-O0-NEXT: v_mov_b32_e32 v0, 4 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:16 +; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 offset:16 +; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: .LBB3_6: ; %Flow ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 6 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 7 +; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0 +; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: s_branch .LBB3_1 ; GCN-O0-NEXT: .LBB3_7: ; %Flow1 @@ -855,18 +892,14 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 4 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 5 +; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2 +; GCN-O0-NEXT: v_readlane_b32 s1, v0, 3 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: .LBB3_8: ; %bb.outer.end ; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 3 -; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: v_mov_b32_e32 v2, 3 ; GCN-O0-NEXT: v_mov_b32_e32 v1, 0 ; GCN-O0-NEXT: s_mov_b32 m0, -1 @@ -911,8 +944,10 @@ define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %a ; GCN-LABEL: s_endpgm_unsafe_barrier: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 1, v0 -; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GCN-NEXT: s_cbranch_execz .LBB4_2 +; GCN-NEXT: s_mov_b64 s[2:3], exec +; GCN-NEXT: s_and_b64 s[4:5], vcc, -1 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB4_2 ; GCN-NEXT: ; %bb.1: ; %bb.then ; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 @@ -921,8 +956,8 @@ define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %a ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dword v1, v[0:1], s[4:7], 0 addr64 -; GCN-NEXT: .LBB4_2: ; %bb.end ; GCN-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN-NEXT: .LBB4_2: ; %bb.end ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_barrier ; GCN-NEXT: s_endpgm @@ -937,9 +972,9 @@ define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %a ; GCN-O0-NEXT: s_addc_u32 s13, s13, 0 ; GCN-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane ; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 -; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-O0-NEXT: v_writelane_b32 v0, s0, 0 @@ -947,48 +982,49 @@ define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %a ; GCN-O0-NEXT: v_mov_b32_e32 v2, v1 ; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b32 s0, 1 -; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v1, s0 -; GCN-O0-NEXT: s_mov_b64 s[0:1], exec -; GCN-O0-NEXT: v_writelane_b32 v0, s0, 2 -; GCN-O0-NEXT: v_writelane_b32 v0, s1, 3 -; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[0:1], v1, s0 +; GCN-O0-NEXT: s_mov_b64 s[2:3], exec +; GCN-O0-NEXT: v_writelane_b32 v0, s2, 2 +; GCN-O0-NEXT: v_writelane_b32 v0, s3, 3 +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] -; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] -; GCN-O0-NEXT: s_cbranch_execz .LBB4_2 -; GCN-O0-NEXT: ; %bb.1: ; %bb.then -; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] +; GCN-O0-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GCN-O0-NEXT: s_cmov_b64 exec, s[0:1] +; GCN-O0-NEXT: s_cbranch_scc1 .LBB4_1 +; GCN-O0-NEXT: s_branch .LBB4_2 +; GCN-O0-NEXT: .LBB4_1: ; %bb.then +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0 -; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1 +; GCN-O0-NEXT: v_readlane_b32 s0, v1, 2 +; GCN-O0-NEXT: v_readlane_b32 s1, v1, 3 +; GCN-O0-NEXT: v_readlane_b32 s4, v1, 0 +; GCN-O0-NEXT: v_readlane_b32 s5, v1, 1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 -; GCN-O0-NEXT: s_mov_b32 s4, 0 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 -; GCN-O0-NEXT: s_mov_b32 s5, s2 -; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 -; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] +; GCN-O0-NEXT: s_mov_b32 s6, 0xf000 +; GCN-O0-NEXT: s_mov_b32 s2, 0 +; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; GCN-O0-NEXT: s_mov_b32 s3, s6 +; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GCN-O0-NEXT: s_mov_b64 s[6:7], s[2:3] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_ashrrev_i32_e64 v2, 31, v0 ; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GCN-O0-NEXT: v_mov_b32_e32 v1, v2 -; GCN-O0-NEXT: s_mov_b32 s4, 2 -; GCN-O0-NEXT: v_lshl_b64 v[1:2], v[0:1], s4 +; GCN-O0-NEXT: s_mov_b32 s2, 2 +; GCN-O0-NEXT: v_lshl_b64 v[1:2], v[0:1], s2 ; GCN-O0-NEXT: v_mov_b32_e32 v0, 0 -; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 +; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: .LBB4_2: ; %bb.end -; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 3 -; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: s_barrier ; GCN-O0-NEXT: ; kill: killed $vgpr0 ; GCN-O0-NEXT: s_endpgm @@ -1020,44 +1056,51 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-NEXT: s_branch .LBB5_3 ; GCN-NEXT: .LBB5_1: ; %Flow ; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1 -; GCN-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-NEXT: s_or_b64 exec, exec, s[14:15] ; GCN-NEXT: .LBB5_2: ; %bb10 ; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1 -; GCN-NEXT: s_or_b64 exec, exec, s[14:15] ; GCN-NEXT: s_and_b64 s[6:7], exec, s[4:5] ; GCN-NEXT: s_or_b64 s[12:13], s[6:7], s[12:13] +; GCN-NEXT: s_andn2_b64 s[10:11], exec, s[12:13] +; GCN-NEXT: s_and_b64 s[6:7], s[10:11], -1 ; GCN-NEXT: s_mov_b64 s[6:7], 0 -; GCN-NEXT: s_andn2_b64 exec, exec, s[12:13] -; GCN-NEXT: s_cbranch_execz .LBB5_7 +; GCN-NEXT: s_cselect_b64 exec, s[10:11], s[12:13] +; GCN-NEXT: s_cbranch_scc0 .LBB5_7 ; GCN-NEXT: .LBB5_3: ; %bb1 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: s_and_b64 s[10:11], exec, vcc ; GCN-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] -; GCN-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GCN-NEXT: s_cbranch_execnz .LBB5_3 +; GCN-NEXT: s_andn2_b64 s[10:11], exec, s[6:7] +; GCN-NEXT: s_and_b64 s[14:15], s[10:11], -1 +; GCN-NEXT: s_cselect_b64 exec, s[10:11], s[6:7] +; GCN-NEXT: s_cbranch_scc1 .LBB5_3 ; GCN-NEXT: ; %bb.4: ; %bb2 ; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1 -; GCN-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-NEXT: s_mov_b32 s9, s8 ; GCN-NEXT: s_mov_b32 s10, s8 ; GCN-NEXT: s_mov_b32 s11, s8 +; GCN-NEXT: s_and_b64 s[6:7], s[4:5], exec ; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: s_mov_b64 s[14:15], exec +; GCN-NEXT: s_and_b64 s[16:17], s[6:7], -1 ; GCN-NEXT: v_mov_b32_e32 v1, s9 ; GCN-NEXT: v_mov_b32_e32 v2, s10 ; GCN-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NEXT: s_and_saveexec_b64 s[14:15], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 +; GCN-NEXT: s_cmov_b64 exec, s[6:7] +; GCN-NEXT: s_cbranch_scc0 .LBB5_2 ; GCN-NEXT: ; %bb.5: ; %bb4 ; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1 ; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GCN-NEXT: s_mov_b64 s[16:17], exec ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cmp_gt_f32_e64 s[6:7], 0, v0 ; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: s_and_b64 s[18:19], s[6:7], -1 ; GCN-NEXT: v_mov_b32_e32 v1, s9 ; GCN-NEXT: v_mov_b32_e32 v2, s10 ; GCN-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NEXT: s_and_saveexec_b64 s[10:11], s[6:7] -; GCN-NEXT: s_cbranch_execz .LBB5_1 +; GCN-NEXT: s_cmov_b64 exec, s[6:7] +; GCN-NEXT: s_cbranch_scc0 .LBB5_1 ; GCN-NEXT: ; %bb.6: ; %bb8 ; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1 ; GCN-NEXT: s_mov_b32 s9, s8 @@ -1065,9 +1108,9 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-NEXT: v_mov_b32_e32 v1, s9 ; GCN-NEXT: v_mov_b32_e32 v2, s10 ; GCN-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NEXT: s_or_b64 exec, exec, s[16:17] ; GCN-NEXT: s_branch .LBB5_1 ; GCN-NEXT: .LBB5_7: ; %bb12 -; GCN-NEXT: s_or_b64 exec, exec, s[12:13] ; GCN-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen @@ -1087,10 +1130,10 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] ; GCN-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane ; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(1) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_mov_b64 exec, s[16:17] ; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 s[4:5], 0 ; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] @@ -1099,61 +1142,57 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-O0-NEXT: v_writelane_b32 v0, s7, 1 ; GCN-O0-NEXT: v_writelane_b32 v0, s4, 2 ; GCN-O0-NEXT: v_writelane_b32 v0, s5, 3 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_mov_b64 exec, s[16:17] ; GCN-O0-NEXT: .LBB5_1: ; %bb1 ; GCN-O0-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_mov_b64 exec, s[16:17] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_readlane_b32 s8, v0, 2 ; GCN-O0-NEXT: v_readlane_b32 s9, v0, 3 -; GCN-O0-NEXT: v_readlane_b32 s6, v0, 0 -; GCN-O0-NEXT: v_readlane_b32 s7, v0, 1 -; GCN-O0-NEXT: v_writelane_b32 v0, s6, 4 -; GCN-O0-NEXT: v_writelane_b32 v0, s7, 5 +; GCN-O0-NEXT: v_readlane_b32 s4, v0, 0 +; GCN-O0-NEXT: v_readlane_b32 s5, v0, 1 +; GCN-O0-NEXT: v_writelane_b32 v0, s4, 4 +; GCN-O0-NEXT: v_writelane_b32 v0, s5, 5 ; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b32 s4, 0x207 +; GCN-O0-NEXT: s_mov_b32 s6, 0x207 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, s4 -; GCN-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GCN-O0-NEXT: v_writelane_b32 v0, s4, 6 -; GCN-O0-NEXT: v_writelane_b32 v0, s5, 7 -; GCN-O0-NEXT: v_writelane_b32 v0, s6, 0 -; GCN-O0-NEXT: v_writelane_b32 v0, s7, 1 -; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-O0-NEXT: v_writelane_b32 v0, s6, 2 -; GCN-O0-NEXT: v_writelane_b32 v0, s7, 3 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: v_cmp_lt_i32_e64 s[6:7], v1, s6 +; GCN-O0-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GCN-O0-NEXT: v_writelane_b32 v0, s4, 0 +; GCN-O0-NEXT: v_writelane_b32 v0, s5, 1 +; GCN-O0-NEXT: s_mov_b64 s[4:5], s[6:7] +; GCN-O0-NEXT: v_writelane_b32 v0, s4, 2 +; GCN-O0-NEXT: v_writelane_b32 v0, s5, 3 +; GCN-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN-O0-NEXT: s_cbranch_execnz .LBB5_1 +; GCN-O0-NEXT: s_mov_b64 exec, s[16:17] +; GCN-O0-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GCN-O0-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GCN-O0-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GCN-O0-NEXT: s_cbranch_scc1 .LBB5_1 ; GCN-O0-NEXT: ; %bb.2: ; %bb2 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v0, 6 -; GCN-O0-NEXT: v_readlane_b32 s5, v0, 7 -; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-O0-NEXT: s_mov_b64 exec, s[16:17] ; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b32 s6, 0 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[4:5], v1, s6 -; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v1, s6 -; GCN-O0-NEXT: v_writelane_b32 v0, s4, 8 -; GCN-O0-NEXT: v_writelane_b32 v0, s5, 9 ; GCN-O0-NEXT: s_mov_b32 s4, 0 -; GCN-O0-NEXT: s_mov_b32 s8, s4 -; GCN-O0-NEXT: s_mov_b32 s9, s4 -; GCN-O0-NEXT: s_mov_b32 s10, s4 -; GCN-O0-NEXT: s_mov_b32 s11, s4 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[6:7], v1, s4 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, s4 +; GCN-O0-NEXT: v_writelane_b32 v0, s6, 6 +; GCN-O0-NEXT: v_writelane_b32 v0, s7, 7 +; GCN-O0-NEXT: s_mov_b32 s6, 0 +; GCN-O0-NEXT: s_mov_b32 s8, s6 +; GCN-O0-NEXT: s_mov_b32 s9, s6 +; GCN-O0-NEXT: s_mov_b32 s10, s6 +; GCN-O0-NEXT: s_mov_b32 s11, s6 ; GCN-O0-NEXT: v_mov_b32_e32 v1, s8 ; GCN-O0-NEXT: v_mov_b32_e32 v2, s9 ; GCN-O0-NEXT: v_mov_b32_e32 v3, s10 @@ -1163,31 +1202,32 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[4:5], exec -; GCN-O0-NEXT: v_writelane_b32 v0, s4, 10 -; GCN-O0-NEXT: v_writelane_b32 v0, s5, 11 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_mov_b64 s[6:7], exec +; GCN-O0-NEXT: v_writelane_b32 v0, s6, 8 +; GCN-O0-NEXT: v_writelane_b32 v0, s7, 9 +; GCN-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: s_cbranch_execz .LBB5_5 -; GCN-O0-NEXT: ; %bb.3: ; %bb4 +; GCN-O0-NEXT: s_mov_b64 exec, s[16:17] +; GCN-O0-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GCN-O0-NEXT: s_cmov_b64 exec, s[4:5] +; GCN-O0-NEXT: s_cbranch_scc1 .LBB5_3 +; GCN-O0-NEXT: s_branch .LBB5_5 +; GCN-O0-NEXT: .LBB5_3: ; %bb4 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_mov_b64 exec, s[16:17] ; GCN-O0-NEXT: ; implicit-def: $sgpr4 ; GCN-O0-NEXT: v_mov_b32_e32 v1, s4 ; GCN-O0-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen -; GCN-O0-NEXT: s_mov_b32 s4, 0 +; GCN-O0-NEXT: s_mov_b32 s6, 0 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_cmp_lt_f32_e64 s[6:7], v1, s4 -; GCN-O0-NEXT: s_mov_b32 s8, s4 -; GCN-O0-NEXT: s_mov_b32 s9, s4 -; GCN-O0-NEXT: s_mov_b32 s10, s4 -; GCN-O0-NEXT: s_mov_b32 s11, s4 +; GCN-O0-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, s6 +; GCN-O0-NEXT: s_mov_b32 s8, s6 +; GCN-O0-NEXT: s_mov_b32 s9, s6 +; GCN-O0-NEXT: s_mov_b32 s10, s6 +; GCN-O0-NEXT: s_mov_b32 s11, s6 ; GCN-O0-NEXT: v_mov_b32_e32 v1, s8 ; GCN-O0-NEXT: v_mov_b32_e32 v2, s9 ; GCN-O0-NEXT: v_mov_b32_e32 v3, s10 @@ -1197,49 +1237,49 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 s[4:5], exec -; GCN-O0-NEXT: v_writelane_b32 v0, s4, 12 -; GCN-O0-NEXT: v_writelane_b32 v0, s5, 13 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_mov_b64 s[6:7], exec +; GCN-O0-NEXT: v_writelane_b32 v0, s6, 10 +; GCN-O0-NEXT: v_writelane_b32 v0, s7, 11 +; GCN-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: s_cbranch_execz .LBB5_6 -; GCN-O0-NEXT: ; %bb.4: ; %bb8 +; GCN-O0-NEXT: s_mov_b64 exec, s[16:17] +; GCN-O0-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GCN-O0-NEXT: s_cmov_b64 exec, s[4:5] +; GCN-O0-NEXT: s_cbranch_scc1 .LBB5_4 +; GCN-O0-NEXT: s_branch .LBB5_6 +; GCN-O0-NEXT: .LBB5_4: ; %bb8 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 -; GCN-O0-NEXT: s_mov_b32 s10, 0 -; GCN-O0-NEXT: ; implicit-def: $sgpr4 -; GCN-O0-NEXT: ; implicit-def: $sgpr5 -; GCN-O0-NEXT: ; implicit-def: $sgpr9 -; GCN-O0-NEXT: ; implicit-def: $sgpr5 -; GCN-O0-NEXT: ; implicit-def: $sgpr8 -; GCN-O0-NEXT: ; implicit-def: $sgpr5 -; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 -; GCN-O0-NEXT: s_mov_b32 s5, s10 -; GCN-O0-NEXT: s_mov_b32 s6, s9 -; GCN-O0-NEXT: s_mov_b32 s7, s8 +; GCN-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[16:17] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s4, v0, 10 +; GCN-O0-NEXT: v_readlane_b32 s5, v0, 11 +; GCN-O0-NEXT: s_mov_b32 s12, 0 +; GCN-O0-NEXT: ; implicit-def: $sgpr8 +; GCN-O0-NEXT: ; implicit-def: $sgpr6 +; GCN-O0-NEXT: ; implicit-def: $sgpr7 +; GCN-O0-NEXT: ; implicit-def: $sgpr6 +; GCN-O0-NEXT: ; implicit-def: $sgpr6 +; GCN-O0-NEXT: ; implicit-def: $sgpr9 +; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 +; GCN-O0-NEXT: s_mov_b32 s9, s12 +; GCN-O0-NEXT: s_mov_b32 s10, s7 +; GCN-O0-NEXT: s_mov_b32 s11, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v0, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s11 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-O0-NEXT: s_branch .LBB5_6 ; GCN-O0-NEXT: .LBB5_5: ; %Flow2 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: s_waitcnt expcnt(1) -; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v4, 10 -; GCN-O0-NEXT: v_readlane_b32 s5, v4, 11 -; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -1254,14 +1294,13 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-O0-NEXT: s_branch .LBB5_7 ; GCN-O0-NEXT: .LBB5_6: ; %Flow ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(1) ; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_mov_b64 exec, s[16:17] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v4, 12 -; GCN-O0-NEXT: v_readlane_b32 s5, v4, 13 -; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-O0-NEXT: v_readlane_b32 s4, v4, 8 +; GCN-O0-NEXT: v_readlane_b32 s5, v4, 9 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload @@ -1273,95 +1312,89 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-O0-NEXT: s_branch .LBB5_5 ; GCN-O0-NEXT: .LBB5_7: ; %bb10 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(3) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_mov_b64 exec, s[16:17] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s6, v0, 8 -; GCN-O0-NEXT: v_readlane_b32 s7, v0, 9 -; GCN-O0-NEXT: s_mov_b64 s[4:5], -1 -; GCN-O0-NEXT: v_writelane_b32 v0, s4, 14 -; GCN-O0-NEXT: v_writelane_b32 v0, s5, 15 -; GCN-O0-NEXT: s_mov_b64 s[4:5], exec -; GCN-O0-NEXT: v_writelane_b32 v0, s4, 16 -; GCN-O0-NEXT: v_writelane_b32 v0, s5, 17 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: v_readlane_b32 s4, v0, 6 +; GCN-O0-NEXT: v_readlane_b32 s5, v0, 7 +; GCN-O0-NEXT: s_mov_b64 s[6:7], -1 +; GCN-O0-NEXT: v_writelane_b32 v0, s6, 12 +; GCN-O0-NEXT: v_writelane_b32 v0, s7, 13 +; GCN-O0-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-O0-NEXT: s_mov_b64 s[6:7], exec +; GCN-O0-NEXT: v_writelane_b32 v0, s6, 14 +; GCN-O0-NEXT: v_writelane_b32 v0, s7, 15 +; GCN-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: s_cbranch_execz .LBB5_9 -; GCN-O0-NEXT: ; %bb.8: ; %Flow1 +; GCN-O0-NEXT: s_mov_b64 exec, s[16:17] +; GCN-O0-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GCN-O0-NEXT: s_cmov_b64 exec, s[4:5] +; GCN-O0-NEXT: s_cbranch_scc1 .LBB5_8 +; GCN-O0-NEXT: s_branch .LBB5_9 +; GCN-O0-NEXT: .LBB5_8: ; %Flow1 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: s_mov_b64 s[4:5], 0 -; GCN-O0-NEXT: s_xor_b64 s[4:5], exec, -1 +; GCN-O0-NEXT: s_mov_b64 exec, s[16:17] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_writelane_b32 v0, s4, 14 -; GCN-O0-NEXT: v_writelane_b32 v0, s5, 15 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: v_readlane_b32 s4, v0, 14 +; GCN-O0-NEXT: v_readlane_b32 s5, v0, 15 +; GCN-O0-NEXT: s_mov_b64 s[6:7], 0 +; GCN-O0-NEXT: s_xor_b64 s[6:7], exec, -1 +; GCN-O0-NEXT: v_writelane_b32 v0, s6, 12 +; GCN-O0-NEXT: v_writelane_b32 v0, s7, 13 +; GCN-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_mov_b64 exec, s[16:17] +; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-O0-NEXT: .LBB5_9: ; %Flow3 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_mov_b64 exec, s[16:17] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s8, v4, 16 -; GCN-O0-NEXT: v_readlane_b32 s9, v4, 17 -; GCN-O0-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-O0-NEXT: v_readlane_b32 s6, v4, 4 ; GCN-O0-NEXT: v_readlane_b32 s7, v4, 5 -; GCN-O0-NEXT: v_readlane_b32 s4, v4, 14 -; GCN-O0-NEXT: v_readlane_b32 s5, v4, 15 +; GCN-O0-NEXT: v_readlane_b32 s4, v4, 12 +; GCN-O0-NEXT: v_readlane_b32 s5, v4, 13 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_and_b64 s[4:5], exec, s[4:5] -; GCN-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GCN-O0-NEXT: s_mov_b64 s[6:7], 0 -; GCN-O0-NEXT: s_mov_b64 s[8:9], s[4:5] +; GCN-O0-NEXT: s_or_b64 s[6:7], s[4:5], s[6:7] +; GCN-O0-NEXT: s_mov_b64 s[4:5], 0 +; GCN-O0-NEXT: s_mov_b64 s[8:9], s[6:7] ; GCN-O0-NEXT: v_writelane_b32 v4, s8, 0 ; GCN-O0-NEXT: v_writelane_b32 v4, s9, 1 -; GCN-O0-NEXT: v_writelane_b32 v4, s6, 2 -; GCN-O0-NEXT: v_writelane_b32 v4, s7, 3 -; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-O0-NEXT: v_writelane_b32 v4, s6, 18 -; GCN-O0-NEXT: v_writelane_b32 v4, s7, 19 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: v_writelane_b32 v4, s4, 2 +; GCN-O0-NEXT: v_writelane_b32 v4, s5, 3 +; GCN-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_mov_b64 exec, s[16:17] ; GCN-O0-NEXT: s_waitcnt vmcnt(1) ; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN-O0-NEXT: s_cbranch_execnz .LBB5_1 +; GCN-O0-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GCN-O0-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GCN-O0-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GCN-O0-NEXT: s_cbranch_scc1 .LBB5_1 ; GCN-O0-NEXT: ; %bb.10: ; %bb12 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(3) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v0, 18 -; GCN-O0-NEXT: v_readlane_b32 s5, v0, 19 -; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-O0-NEXT: ; %bb.11: ; %bb12 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_mov_b64 exec, s[16:17] ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.mir b/llvm/test/CodeGen/AMDGPU/collapse-endcf.mir index 48ca53732ed06..c45fe79d6f549 100644 --- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.mir +++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.mir @@ -12,24 +12,35 @@ body: | ; GCN: bb.0: ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.4(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], undef %1:sreg_64, implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec + ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 undef %1:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[S_AND_B64_]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.4 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: - ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY1]], undef %3:sreg_64, implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_1]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec + ; GCN-NEXT: [[S_AND_B64_2:%[0-9]+]]:sreg_64 = S_AND_B64 undef %3:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: [[S_AND_B64_3:%[0-9]+]]:sreg_64 = S_AND_B64 [[S_AND_B64_2]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_2]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.3 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: + ; GCN-NEXT: successors: %bb.3(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY1]], implicit-def $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3: ; GCN-NEXT: successors: %bb.4(0x80000000) ; GCN-NEXT: {{ $}} + ; GCN-NEXT: DBG_VALUE + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY]], implicit-def $scc + ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.4: - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY]], implicit-def $scc ; GCN-NEXT: DBG_VALUE ; GCN-NEXT: S_ENDPGM 0 bb.0: @@ -43,14 +54,14 @@ body: | %2:sreg_64 = SI_IF undef %3:sreg_64, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.2: + SI_WAVE_RECONVERGE %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.3: - SI_END_CF %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec DBG_VALUE + SI_WAVE_RECONVERGE %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.4: DBG_VALUE - SI_END_CF %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_ENDPGM 0 ... @@ -66,27 +77,37 @@ body: | ; GCN: bb.0: ; GCN-NEXT: successors: %bb.5(0x40000000), %bb.1(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], undef %1:sreg_64, implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.5, implicit $exec + ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 undef %1:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[S_AND_B64_]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.5 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: - ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY1]], undef %3:sreg_64, implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_1]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec + ; GCN-NEXT: [[S_AND_B64_2:%[0-9]+]]:sreg_64 = S_AND_B64 undef %3:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: [[S_AND_B64_3:%[0-9]+]]:sreg_64 = S_AND_B64 [[S_AND_B64_2]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_2]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.3 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: + ; GCN-NEXT: successors: %bb.3(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY1]], implicit-def $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3: ; GCN-NEXT: successors: %bb.4(0x80000000) ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.4: ; GCN-NEXT: successors: %bb.5(0x80000000) ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY]], implicit-def $scc + ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.5: - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY]], implicit-def $scc ; GCN-NEXT: S_ENDPGM 0 bb.0: %0:sreg_64 = SI_IF undef %1:sreg_64, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec @@ -97,14 +118,14 @@ body: | %2:sreg_64 = SI_IF undef %3:sreg_64, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.2: + SI_WAVE_RECONVERGE %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.3: - SI_END_CF %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.5: + SI_WAVE_RECONVERGE %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.4: - SI_END_CF %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_ENDPGM 0 ... @@ -120,29 +141,38 @@ body: | ; GCN: bb.0: ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.5(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], undef %1:sreg_64, implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.5, implicit $exec + ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 undef %1:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[S_AND_B64_]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.5 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: - ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY1]], undef %3:sreg_64, implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_1]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec + ; GCN-NEXT: [[S_AND_B64_2:%[0-9]+]]:sreg_64 = S_AND_B64 undef %3:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: [[S_AND_B64_3:%[0-9]+]]:sreg_64 = S_AND_B64 [[S_AND_B64_2]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_2]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.3 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: + ; GCN-NEXT: successors: %bb.3(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY1]], implicit-def $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3: ; GCN-NEXT: successors: %bb.4(0x80000000) ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.4: ; GCN-NEXT: successors: %bb.5(0x80000000) ; GCN-NEXT: {{ $}} ; GCN-NEXT: DBG_VALUE + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY]], implicit-def $scc ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.5: - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY]], implicit-def $scc ; GCN-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1, %bb.4 @@ -155,15 +185,15 @@ body: | %2:sreg_64 = SI_IF undef %3:sreg_64, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.2: + SI_WAVE_RECONVERGE %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.3: - SI_END_CF %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.5: DBG_VALUE + SI_WAVE_RECONVERGE %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.4: - SI_END_CF %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_ENDPGM 0 ... @@ -179,31 +209,37 @@ body: | ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.4(0x40000000) ; GCN-NEXT: liveins: $vgpr0, $sgpr0_sgpr1 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], undef %1:sreg_64, implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec + ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 undef %1:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[S_AND_B64_]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.4 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY1]], undef %3:sreg_64, implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_1]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec + ; GCN-NEXT: [[S_AND_B64_2:%[0-9]+]]:sreg_64 = S_AND_B64 undef %3:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: [[S_AND_B64_3:%[0-9]+]]:sreg_64 = S_AND_B64 [[S_AND_B64_2]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_2]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.3 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: ; GCN-NEXT: successors: %bb.3(0x80000000) ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY1]], implicit-def $scc + ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.3: ; GCN-NEXT: successors: %bb.4(0x80000000) ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; GCN-NEXT: [[S_BREV_B32_:%[0-9]+]]:sgpr_32 = S_BREV_B32 [[DEF]] ; GCN-NEXT: KILL [[DEF]] + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY]], implicit-def $scc ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.4: - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY]], implicit-def $scc ; GCN-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1, %bb.4 @@ -217,15 +253,15 @@ body: | %2:sreg_64 = SI_IF undef %3:sreg_64, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.2: + SI_WAVE_RECONVERGE %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.3: %4:sgpr_32 = IMPLICIT_DEF %5:sgpr_32 = S_BREV_B32 %4 KILL %4 - SI_END_CF %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + SI_WAVE_RECONVERGE %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.4: - SI_END_CF %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_ENDPGM 0 ... @@ -242,22 +278,28 @@ body: | ; GCN: bb.0: ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.4(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], undef %1:sreg_64, implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec + ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 undef %1:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[S_AND_B64_]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.4 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY1]], undef %3:sreg_64, implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_1]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec + ; GCN-NEXT: [[S_AND_B64_2:%[0-9]+]]:sreg_64 = S_AND_B64 undef %3:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: [[S_AND_B64_3:%[0-9]+]]:sreg_64 = S_AND_B64 [[S_AND_B64_2]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_2]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.3 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: ; GCN-NEXT: successors: %bb.3(0x80000000) ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY1]], implicit-def $scc + ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.3: ; GCN-NEXT: successors: %bb.4(0x80000000) ; GCN-NEXT: {{ $}} @@ -265,9 +307,9 @@ body: | ; GCN-NEXT: [[S_BREV_B32_:%[0-9]+]]:sgpr_32 = S_BREV_B32 [[DEF]] ; GCN-NEXT: KILL [[DEF]] ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY [[S_BREV_B32_]] + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY]], implicit-def $scc ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.4: - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY]], implicit-def $scc ; GCN-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1, %bb.4 @@ -280,16 +322,16 @@ body: | %2:sreg_64 = SI_IF undef %3:sreg_64, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.2: + SI_WAVE_RECONVERGE %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.3: - SI_END_CF %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec %4:sgpr_32 = IMPLICIT_DEF %5:sgpr_32 = S_BREV_B32 %4 KILL %4 %6:sgpr_32 = COPY %5 + SI_WAVE_RECONVERGE %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.4: - SI_END_CF %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_ENDPGM 0 ... @@ -305,30 +347,35 @@ body: | ; GCN: bb.0: ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.4(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], undef %1:sreg_64, implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec + ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 undef %1:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[S_AND_B64_]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.4 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY1]], undef %3:sreg_64, implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_1]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec + ; GCN-NEXT: [[S_AND_B64_2:%[0-9]+]]:sreg_64 = S_AND_B64 undef %3:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: [[S_AND_B64_3:%[0-9]+]]:sreg_64 = S_AND_B64 [[S_AND_B64_2]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_2]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.3 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: ; GCN-NEXT: successors: %bb.3(0x80000000) ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY1]], implicit-def $scc + ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.3: ; GCN-NEXT: successors: %bb.4(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY1]], implicit-def $scc ; GCN-NEXT: [[S_BREV_B64_:%[0-9]+]]:sreg_64 = S_BREV_B64 $exec + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY]], implicit-def $scc ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.4: - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY]], implicit-def $scc ; GCN-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1, %bb.4 @@ -341,13 +388,13 @@ body: | %2:sreg_64 = SI_IF undef %3:sreg_64, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.2: + SI_WAVE_RECONVERGE %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.3: - SI_END_CF %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec %4:sreg_64 = S_BREV_B64 $exec + SI_WAVE_RECONVERGE %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.4: - SI_END_CF %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_ENDPGM 0 ... @@ -363,31 +410,36 @@ body: | ; GCN: bb.0: ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.4(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], undef %1:sreg_64, implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec + ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 undef %1:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[S_AND_B64_]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.4 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[DEF:%[0-9]+]]:vreg_128 = IMPLICIT_DEF - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY1]], undef %4:sreg_64, implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_1]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec + ; GCN-NEXT: [[S_AND_B64_2:%[0-9]+]]:sreg_64 = S_AND_B64 undef %4:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: [[S_AND_B64_3:%[0-9]+]]:sreg_64 = S_AND_B64 [[S_AND_B64_2]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_2]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.3 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: ; GCN-NEXT: successors: %bb.3(0x80000000) ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY1]], implicit-def $scc + ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.3: ; GCN-NEXT: successors: %bb.4(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY1]], implicit-def $scc ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub2 + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY]], implicit-def $scc ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.4: - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY]], implicit-def $scc ; GCN-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1, %bb.4 @@ -401,13 +453,13 @@ body: | %3:sreg_64 = SI_IF undef %4:sreg_64, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.2: + SI_WAVE_RECONVERGE %3:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.3: - SI_END_CF %3:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec %5:vgpr_32 = COPY %2.sub2 + SI_WAVE_RECONVERGE %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.4: - SI_END_CF %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_ENDPGM 0 ... @@ -422,31 +474,40 @@ body: | ; GCN: bb.0: ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.4(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], undef %1:sreg_64, implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec + ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 undef %1:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[S_AND_B64_]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.4 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: - ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000) + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY1]], undef %3:sreg_64, implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_1]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.5, implicit $exec + ; GCN-NEXT: [[S_AND_B64_2:%[0-9]+]]:sreg_64 = S_AND_B64 undef %3:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: [[S_AND_B64_3:%[0-9]+]]:sreg_64 = S_AND_B64 [[S_AND_B64_2]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_2]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.3 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: + ; GCN-NEXT: successors: %bb.3(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY1]], implicit-def $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3: ; GCN-NEXT: successors: %bb.5(0x80000000) ; GCN-NEXT: {{ $}} ; GCN-NEXT: S_BRANCH %bb.5 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.4: - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY]], implicit-def $scc ; GCN-NEXT: S_ENDPGM 0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.5: ; GCN-NEXT: successors: %bb.4(0x80000000) ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY]], implicit-def $scc ; GCN-NEXT: S_BRANCH %bb.4 bb.0: successors: %bb.1, %bb.4 @@ -459,16 +520,16 @@ body: | %2:sreg_64 = SI_IF undef %3:sreg_64, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.2: + SI_WAVE_RECONVERGE %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.3: - SI_END_CF %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.5 bb.4: - SI_END_CF %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_ENDPGM 0 bb.5: + SI_WAVE_RECONVERGE %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.4 ... @@ -494,7 +555,7 @@ body: | ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: %bb.1(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[V_CMP_LT_U32_e64_]], implicit-def $scc + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[V_CMP_LT_U32_e64_]], implicit-def $scc ; GCN-NEXT: S_BRANCH %bb.1 bb.0: successors: %bb.1 @@ -506,12 +567,12 @@ body: | bb.1: successors: %bb.1 - SI_END_CF %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + SI_WAVE_RECONVERGE %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.1 ... -# Both s_or_b64 shall be preserved since the outer SI_END_CF belongs to SI_ELSE. +# Both s_or_b64 shall be preserved since the outer SI_WAVE_RECONVERGE belongs to SI_ELSE. --- name: simple_outer_if_else @@ -523,43 +584,49 @@ body: | ; GCN: bb.0: ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], undef %1:sreg_64, implicit-def dead $scc - ; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64 = S_XOR_B64 [[S_AND_B64_]], [[COPY]], implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 undef %1:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64 = S_XOR_B64 [[S_AND_B64_]], $exec, implicit-def $scc + ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[S_AND_B64_]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.2 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: %bb.2(0x80000000) ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[S_XOR_B64_]], implicit-def $scc ; GCN-NEXT: S_BRANCH %bb.2 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: ; GCN-NEXT: successors: %bb.3(0x40000000), %bb.6(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[S_OR_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_OR_SAVEEXEC_B64 [[S_XOR_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 $exec, [[S_OR_SAVEEXEC_B64_]], implicit-def $scc - ; GCN-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_B64_1]], implicit-def $scc - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.6, implicit $exec + ; GCN-NEXT: [[S_XOR_B64_1:%[0-9]+]]:sreg_64 = S_XOR_B64 [[S_XOR_B64_]], $exec, implicit-def $scc + ; GCN-NEXT: [[S_AND_B64_2:%[0-9]+]]:sreg_64 = S_AND_B64 [[S_XOR_B64_]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_XOR_B64_]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.3, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.6 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.3: ; GCN-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_2:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY1]], undef %4:sreg_64, implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_2]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec + ; GCN-NEXT: [[S_AND_B64_3:%[0-9]+]]:sreg_64 = S_AND_B64 undef %4:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: [[S_AND_B64_4:%[0-9]+]]:sreg_64 = S_AND_B64 [[S_AND_B64_3]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_3]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.4, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.3 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.4: ; GCN-NEXT: successors: %bb.5(0x80000000) ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY]], implicit-def $scc + ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.5: ; GCN-NEXT: successors: %bb.6(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY1]], implicit-def $scc + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[S_XOR_B64_1]], implicit-def $scc ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.6: - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[S_AND_B64_1]], implicit-def $scc ; GCN-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1, %bb.2 @@ -568,6 +635,7 @@ body: | bb.1: successors: %bb.2 + SI_WAVE_RECONVERGE %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.2 bb.2: @@ -581,14 +649,14 @@ body: | bb.4: successors: %bb.5 + SI_WAVE_RECONVERGE %3:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.5: successors: %bb.6 - SI_END_CF %3:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + SI_WAVE_RECONVERGE %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.6: - SI_END_CF %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_ENDPGM 0 ... @@ -608,38 +676,39 @@ body: | ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], undef %1:sreg_64, implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec + ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 undef %1:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[S_AND_B64_]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.3 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: ; GCN-NEXT: successors: %bb.6(0x80000000) ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY]], implicit-def $scc ; GCN-NEXT: S_BRANCH %bb.6 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.3: ; GCN-NEXT: successors: %bb.4(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY]], implicit-def $scc + ; GCN-NEXT: $exec = S_OR_B64_term $exec, %2, implicit-def $scc ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.4: ; GCN-NEXT: successors: %bb.5(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $exec = S_OR_B64 $exec, %2, implicit-def $scc - ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.5: ; GCN-NEXT: successors: %bb.6(0x80000000) ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.6: ; GCN-NEXT: successors: %bb.4(0x40000000), %bb.0(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY1]], undef %3:sreg_64, implicit-def dead $scc - ; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64 = S_XOR_B64 [[S_AND_B64_1]], [[COPY1]], implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_1]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec - ; GCN-NEXT: S_BRANCH %bb.0 + ; GCN-NEXT: [[S_AND_B64_2:%[0-9]+]]:sreg_64 = S_AND_B64 undef %3:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64 = S_XOR_B64 [[S_AND_B64_2]], $exec, implicit-def $scc + ; GCN-NEXT: [[S_AND_B64_3:%[0-9]+]]:sreg_64 = S_AND_B64 [[S_AND_B64_2]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_2]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.0, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.4 ; GCN-NEXT: S_ENDPGM 0 bb.0: S_BRANCH %bb.6 @@ -648,13 +717,13 @@ body: | %0:sreg_64 = SI_IF undef %1:sreg_64, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.2: + SI_WAVE_RECONVERGE %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.6 bb.3: - SI_END_CF %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + SI_WAVE_RECONVERGE %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.4: - SI_END_CF %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.5: @@ -678,27 +747,36 @@ body: | ; GCN: bb.0: ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.4(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], undef %1:sreg_64, implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec + ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 undef %1:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[S_AND_B64_]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.4 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: - ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY1]], undef %3:sreg_64, implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_1]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec + ; GCN-NEXT: [[S_AND_B64_2:%[0-9]+]]:sreg_64 = S_AND_B64 undef %3:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: [[S_AND_B64_3:%[0-9]+]]:sreg_64 = S_AND_B64 [[S_AND_B64_2]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_2]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.3 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: + ; GCN-NEXT: successors: %bb.3(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY1]], implicit-def $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3: ; GCN-NEXT: successors: %bb.4(0x80000000) ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY]], implicit-def $scc + ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.4: ; GCN-NEXT: successors: %bb.5(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY]], implicit-def $scc - ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.5: ; GCN-NEXT: S_ENDPGM 0 bb.0: @@ -712,13 +790,13 @@ body: | %2:sreg_64 = SI_IF undef %3:sreg_64, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.2: + SI_WAVE_RECONVERGE %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.3: - SI_END_CF %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + SI_WAVE_RECONVERGE %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.4: - SI_END_CF %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.5: S_ENDPGM 0 @@ -740,20 +818,29 @@ body: | ; GCN: bb.0: ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.4(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], undef %1:sreg_64, implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec + ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 undef %1:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[S_AND_B64_]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.4 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: - ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000) + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY1]], undef %3:sreg_64, implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_1]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.5, implicit $exec + ; GCN-NEXT: [[S_AND_B64_2:%[0-9]+]]:sreg_64 = S_AND_B64 undef %3:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: [[S_AND_B64_3:%[0-9]+]]:sreg_64 = S_AND_B64 [[S_AND_B64_2]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_2]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.3 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: + ; GCN-NEXT: successors: %bb.3(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY1]], implicit-def $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3: ; GCN-NEXT: successors: %bb.5(0x80000000) ; GCN-NEXT: {{ $}} ; GCN-NEXT: S_BRANCH %bb.5 @@ -764,11 +851,10 @@ body: | ; GCN-NEXT: bb.5: ; GCN-NEXT: successors: %bb.6(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY]], implicit-def $scc - ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.6: ; GCN-NEXT: successors: %bb.4(0x80000000) ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY]], implicit-def $scc ; GCN-NEXT: S_BRANCH %bb.4 bb.0: successors: %bb.1, %bb.4 @@ -781,9 +867,9 @@ body: | %2:sreg_64 = SI_IF undef %3:sreg_64, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.2: + SI_WAVE_RECONVERGE %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.3: - SI_END_CF %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.5 bb.4: @@ -791,9 +877,9 @@ body: | bb.5: - SI_END_CF %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.6: + SI_WAVE_RECONVERGE %0:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.4 ... @@ -815,54 +901,66 @@ body: | ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; GCN-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 0, killed [[DEF]], implicit $exec - ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.14, implicit $exec - ; GCN-NEXT: S_BRANCH %bb.1 + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[V_CMP_EQ_U32_e64_]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.14 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: - ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.14(0x40000000) + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.6(0x40000000) ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; GCN-NEXT: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 0, killed [[DEF1]], implicit $exec - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY1]], killed [[V_CMP_EQ_U32_e64_1]], implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_1]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.14, implicit $exec - ; GCN-NEXT: S_BRANCH %bb.2 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_1]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[V_CMP_EQ_U32_e64_1]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.6 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: ; GCN-NEXT: successors: %bb.3(0x40000000), %bb.7(0x40000000) ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; GCN-NEXT: [[V_CMP_EQ_U32_e64_2:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 0, killed [[DEF2]], implicit $exec - ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_2:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY2]], killed [[V_CMP_EQ_U32_e64_2]], implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_2]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.7, implicit $exec - ; GCN-NEXT: S_BRANCH %bb.3 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: [[S_AND_B64_2:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_2]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[V_CMP_EQ_U32_e64_2]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.3, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.7 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.3: - ; GCN-NEXT: successors: %bb.4(0x40000000), %bb.7(0x40000000) + ; GCN-NEXT: successors: %bb.4(0x40000000), %bb.5(0x40000000) ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; GCN-NEXT: [[V_CMP_EQ_U32_e64_3:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 0, killed [[DEF3]], implicit $exec - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_3:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY3]], killed [[V_CMP_EQ_U32_e64_3]], implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_3]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.7, implicit $exec - ; GCN-NEXT: S_BRANCH %bb.4 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: [[S_AND_B64_3:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_3]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[V_CMP_EQ_U32_e64_3]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.4, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.5 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.4: + ; GCN-NEXT: successors: %bb.5(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY3]], implicit-def $scc + ; GCN-NEXT: S_BRANCH %bb.5 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.5: ; GCN-NEXT: successors: %bb.7(0x80000000) ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY2]], implicit-def $scc ; GCN-NEXT: S_BRANCH %bb.7 ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.6: + ; GCN-NEXT: successors: %bb.14(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY]], implicit-def $scc + ; GCN-NEXT: S_BRANCH %bb.14 + ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.7: ; GCN-NEXT: successors: %bb.8(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc ; GCN-NEXT: S_BRANCH %bb.8 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.8: @@ -875,34 +973,40 @@ body: | ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; GCN-NEXT: [[V_CMP_EQ_U32_e64_4:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 0, killed [[DEF4]], implicit $exec - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_4:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY4]], killed [[V_CMP_EQ_U32_e64_4]], implicit-def dead $scc - ; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64 = S_XOR_B64 [[S_AND_B64_4]], [[COPY4]], implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_4]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.12, implicit $exec - ; GCN-NEXT: S_BRANCH %bb.11 + ; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64 = S_XOR_B64 [[V_CMP_EQ_U32_e64_4]], $exec, implicit-def $scc + ; GCN-NEXT: [[S_AND_B64_4:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_4]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[V_CMP_EQ_U32_e64_4]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.11, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.12 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.10: - ; GCN-NEXT: successors: %bb.14(0x80000000) + ; GCN-NEXT: successors: %bb.13(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: S_BRANCH %bb.14 + ; GCN-NEXT: $exec = S_OR_B64_term $exec, %15, implicit-def $scc + ; GCN-NEXT: S_BRANCH %bb.13 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.11: ; GCN-NEXT: successors: %bb.12(0x80000000) ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[S_XOR_B64_]], implicit-def $scc ; GCN-NEXT: S_BRANCH %bb.12 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.12: - ; GCN-NEXT: successors: %bb.10(0x40000000), %bb.14(0x40000000) + ; GCN-NEXT: successors: %bb.10(0x40000000), %bb.13(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[S_XOR_B64_1:%[0-9]+]]:sreg_64 = S_XOR_B64 [[S_XOR_B64_]], $exec, implicit-def $scc + ; GCN-NEXT: [[S_AND_B64_5:%[0-9]+]]:sreg_64 = S_AND_B64 [[S_XOR_B64_]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_XOR_B64_]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.10, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.13 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[S_OR_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_OR_SAVEEXEC_B64 [[S_XOR_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; GCN-NEXT: [[S_AND_B64_5:%[0-9]+]]:sreg_64 = S_AND_B64 $exec, [[S_OR_SAVEEXEC_B64_]], implicit-def $scc - ; GCN-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_B64_5]], implicit-def $scc - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.14, implicit $exec - ; GCN-NEXT: S_BRANCH %bb.10 + ; GCN-NEXT: bb.13: + ; GCN-NEXT: successors: %bb.6(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY1]], implicit-def $scc + ; GCN-NEXT: S_BRANCH %bb.6 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.14: - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY]], implicit-def $scc ; GCN-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1, %bb.14 @@ -939,24 +1043,24 @@ body: | bb.4: successors: %bb.5 + SI_WAVE_RECONVERGE %11:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.5 bb.5: successors: %bb.7 - SI_END_CF %11:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + SI_WAVE_RECONVERGE %8:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.7 bb.6: successors: %bb.14 - SI_END_CF %5:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + SI_WAVE_RECONVERGE %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.14 bb.7: successors: %bb.8 - SI_END_CF %8:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.8 bb.8: @@ -975,11 +1079,13 @@ body: | bb.10: successors: %bb.13 + SI_WAVE_RECONVERGE %15:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.13 bb.11: successors: %bb.12 + SI_WAVE_RECONVERGE %14:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.12 bb.12: @@ -991,12 +1097,11 @@ body: | bb.13: successors: %bb.6 - SI_END_CF %15:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + SI_WAVE_RECONVERGE %5:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.6 bb.14: - SI_END_CF %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_ENDPGM 0 ... diff --git a/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir b/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir index 3db2b6ed9ab4b..07eed859ad16e 100644 --- a/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir +++ b/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir @@ -414,10 +414,10 @@ body: | %6:sreg_64 = S_MOV_B64 0 %7:sreg_64 = S_AND_B64 $exec, killed %6, implicit-def dead $scc $vcc = COPY %7 + SI_WAVE_RECONVERGE %0, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.3: liveins: $vcc - SI_END_CF %0, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_ENDPGM 0, implicit $vcc ... diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll index 789150f690d52..18d72b8ae2a47 100644 --- a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; RUN: llc -O0 -mtriple=amdgcn--amdhsa -amdgpu-spill-sgpr-to-vgpr=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=VMEM -check-prefix=GCN %s ; RUN: llc -O0 -mtriple=amdgcn--amdhsa -amdgpu-spill-sgpr-to-vgpr=1 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=VGPR -check-prefix=GCN %s diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-optnone.ll b/llvm/test/CodeGen/AMDGPU/control-flow-optnone.ll index 5ceea9ef63a4a..65842f9985280 100644 --- a/llvm/test/CodeGen/AMDGPU/control-flow-optnone.ll +++ b/llvm/test/CodeGen/AMDGPU/control-flow-optnone.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s diff --git a/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll b/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll index 238f6ab39e839..243e2fdea7855 100644 --- a/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll +++ b/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; RUN: llc -stop-after=amdgpu-isel -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK,ISEL %s ; RUN: llc -stop-after=dead-mi-elimination -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK,DEADMI %s ; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=CHECK,GISEL diff --git a/llvm/test/CodeGen/AMDGPU/convergent-inlineasm.ll b/llvm/test/CodeGen/AMDGPU/convergent-inlineasm.ll index bd523d4ac30b9..f883a7551a694 100644 --- a/llvm/test/CodeGen/AMDGPU/convergent-inlineasm.ll +++ b/llvm/test/CodeGen/AMDGPU/convergent-inlineasm.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; RUN: llc -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s declare i32 @llvm.amdgcn.workitem.id.x() #0 diff --git a/llvm/test/CodeGen/AMDGPU/cse-convergent.ll b/llvm/test/CodeGen/AMDGPU/cse-convergent.ll index 0d74bd39b56fe..c5813fad3c18d 100644 --- a/llvm/test/CodeGen/AMDGPU/cse-convergent.ll +++ b/llvm/test/CodeGen/AMDGPU/cse-convergent.ll @@ -10,20 +10,21 @@ define i32 @test(i32 %val, i32 %cond) { ; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt_depctr 0xffe3 ; GCN-NEXT: s_mov_b32 exec_lo, s4 -; GCN-NEXT: s_or_saveexec_b32 s4, -1 +; GCN-NEXT: s_mov_b32 s4, exec_lo +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: s_not_b32 exec_lo, exec_lo ; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: s_mov_b32 exec_lo, s4 -; GCN-NEXT: v_mov_b32_e32 v3, v0 ; GCN-NEXT: s_not_b32 exec_lo, exec_lo +; GCN-NEXT: s_or_saveexec_b32 s5, -1 ; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: s_not_b32 exec_lo, exec_lo -; GCN-NEXT: s_or_saveexec_b32 s4, -1 -; GCN-NEXT: v_mov_b32_dpp v2, v3 row_xmask:1 row_mask:0xf bank_mask:0xf -; GCN-NEXT: s_mov_b32 exec_lo, s4 -; GCN-NEXT: v_mov_b32_e32 v5, 0 -; GCN-NEXT: v_mov_b32_e32 v4, v2 +; GCN-NEXT: v_mov_b32_dpp v3, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GCN-NEXT: s_mov_b32 exec_lo, s5 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GCN-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: v_mov_b32_e32 v1, v3 +; GCN-NEXT: s_and_b32 s5, vcc_lo, -1 +; GCN-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GCN-NEXT: s_cbranch_scc0 .LBB0_2 ; GCN-NEXT: ; %bb.1: ; %if ; GCN-NEXT: s_or_saveexec_b32 s5, -1 ; GCN-NEXT: v_mov_b32_e32 v2, 0 @@ -35,10 +36,10 @@ define i32 @test(i32 %val, i32 %cond) { ; GCN-NEXT: s_or_saveexec_b32 s5, -1 ; GCN-NEXT: v_mov_b32_dpp v2, v3 row_xmask:1 row_mask:0xf bank_mask:0xf ; GCN-NEXT: s_mov_b32 exec_lo, s5 -; GCN-NEXT: v_mov_b32_e32 v5, v2 -; GCN-NEXT: ; %bb.2: ; %end +; GCN-NEXT: v_mov_b32_e32 v4, v2 ; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GCN-NEXT: v_add_nc_u32_e32 v0, v4, v5 +; GCN-NEXT: .LBB0_2: ; %end +; GCN-NEXT: v_add_nc_u32_e32 v0, v1, v4 ; GCN-NEXT: s_xor_saveexec_b32 s4, -1 ; GCN-NEXT: s_clause 0x1 ; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 diff --git a/llvm/test/CodeGen/AMDGPU/cse-phi-incoming-val.ll b/llvm/test/CodeGen/AMDGPU/cse-phi-incoming-val.ll index c98da81264744..27e67364dbcd4 100644 --- a/llvm/test/CodeGen/AMDGPU/cse-phi-incoming-val.ll +++ b/llvm/test/CodeGen/AMDGPU/cse-phi-incoming-val.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs | FileCheck %s ; Check that the redundant immediate MOV instruction diff --git a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll index fed4b9862dbfb..194a360ebc8ba 100644 --- a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll @@ -134,10 +134,11 @@ define protected amdgpu_kernel void @nand(ptr addrspace(1) %p, ptr addrspace(1) ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_cbranch_execnz .LBB5_1 +; CHECK-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; CHECK-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; CHECK-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; CHECK-NEXT: s_cbranch_scc1 .LBB5_1 ; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end -; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-NEXT: v_mov_b32_e32 v3, s3 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3] @@ -435,10 +436,11 @@ define protected amdgpu_kernel void @fadd(ptr addrspace(1) %p, ptr addrspace(1) ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_cbranch_execnz .LBB18_1 +; CHECK-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; CHECK-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; CHECK-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; CHECK-NEXT: s_cbranch_scc1 .LBB18_1 ; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end -; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v0 ; CHECK-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-NEXT: v_mov_b32_e32 v1, s3 @@ -472,10 +474,11 @@ define protected amdgpu_kernel void @fsub(ptr addrspace(1) %p, ptr addrspace(1) ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_cbranch_execnz .LBB19_1 +; CHECK-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; CHECK-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; CHECK-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; CHECK-NEXT: s_cbranch_scc1 .LBB19_1 ; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end -; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v0 ; CHECK-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-NEXT: v_mov_b32_e32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll index 5cadb65c9c942..1092386eb90c2 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll @@ -18,11 +18,11 @@ define i32 @divergent_lshr_and_cmp(i32 %x) { ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2 ; GCN-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 killed [[S_MOV_B32_]], [[COPY]], implicit $exec + ; GCN-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: S_BRANCH %bb.2 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2.UnifiedReturnBlock: ; GCN-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.0, [[V_LSHLREV_B32_e64_]], %bb.1 - ; GCN-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: $vgpr0 = COPY [[PHI]] ; GCN-NEXT: SI_RETURN implicit $vgpr0 entry: diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-v1i8-extractvecelt-crash.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-v1i8-extractvecelt-crash.ll index eecc91239c728..3d32bdfa6c369 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcombine-v1i8-extractvecelt-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-v1i8-extractvecelt-crash.ll @@ -8,13 +8,15 @@ define void @wombat(i1 %cond, ptr addrspace(5) %addr) { ; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CHECK-NEXT: s_cbranch_execz .LBB0_2 +; CHECK-NEXT: s_mov_b64 s[4:5], exec +; CHECK-NEXT: s_and_b64 s[6:7], vcc, -1 +; CHECK-NEXT: s_cmov_b64 exec, vcc +; CHECK-NEXT: s_cbranch_scc0 .LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %then ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: .LBB0_2: ; %end ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: .LBB0_2: ; %end ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_store_byte v2, v1, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index b2f9bf89d9ec6..c0dd9f989590b 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -67,12 +67,13 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_subbrev_co_u32_e32 v4, vcc, 0, v5, vcc ; GFX9-NEXT: v_subbrev_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v18, v16 +; GFX9-NEXT: s_mov_b64 s[8:9], exec ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v19, v17 +; GFX9-NEXT: v_mov_b32_e32 v18, v16 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; GFX9-NEXT: v_mov_b32_e32 v19, v17 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc ; GFX9-NEXT: v_and_b32_e32 v6, 1, v6 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 @@ -82,13 +83,15 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] ; GFX9-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], vcc +; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec ; GFX9-NEXT: v_cndmask_b32_e64 v13, v11, 0, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v12, v10, 0, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v7, v9, 0, s[4:5] +; GFX9-NEXT: s_and_b64 s[10:11], s[6:7], -1 ; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, 0, s[4:5] -; GFX9-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB0_6 +; GFX9-NEXT: s_cmov_b64 exec, s[6:7] +; GFX9-NEXT: s_cbranch_scc0 .LBB0_6 ; GFX9-NEXT: ; %bb.1: ; %udiv-bb1 ; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 1, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v3, vcc @@ -107,20 +110,21 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_or_b32_e32 v5, v5, v12 ; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[4:5] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[4:5], v7, v[8:9] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_mov_b32_e32 v12, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[6:7] ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-NEXT: s_xor_b64 s[6:7], vcc, exec ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, v5, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-NEXT: s_and_b64 s[10:11], vcc, -1 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5] -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB0_5 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB0_5 ; GFX9-NEXT: ; %bb.2: ; %udiv-preheader ; GFX9-NEXT: v_sub_u32_e32 v12, 64, v22 ; GFX9-NEXT: v_lshrrev_b64 v[6:7], v22, v[8:9] @@ -177,22 +181,23 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, -1, v24, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v25, vcc ; GFX9-NEXT: v_or_b32_e32 v5, v15, v5 -; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX9-NEXT: v_or_b32_e32 v14, v22, v24 ; GFX9-NEXT: v_or_b32_e32 v15, v23, v25 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] +; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_or3_b32 v2, v2, v6, v12 ; GFX9-NEXT: v_and_b32_e32 v6, 1, v30 +; GFX9-NEXT: s_andn2_b64 s[10:11], exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v15, v7 ; GFX9-NEXT: v_or3_b32 v3, v3, 0, v13 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; GFX9-NEXT: v_mov_b32_e32 v14, v6 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB0_3 +; GFX9-NEXT: s_cselect_b64 exec, s[10:11], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB0_3 ; GFX9-NEXT: ; %bb.4: ; %Flow -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: .LBB0_5: ; %Flow2 ; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: .LBB0_5: ; %Flow2 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v5 @@ -200,8 +205,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_or3_b32 v12, v2, v4, v12 ; GFX9-NEXT: v_or_b32_e32 v7, v7, v1 ; GFX9-NEXT: v_or_b32_e32 v6, v6, v0 -; GFX9-NEXT: .LBB0_6: ; %Flow3 ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-NEXT: .LBB0_6: ; %udiv-end ; GFX9-NEXT: v_xor_b32_e32 v2, v17, v16 ; GFX9-NEXT: v_xor_b32_e32 v3, v19, v18 ; GFX9-NEXT: v_xor_b32_e32 v0, v6, v2 @@ -219,8 +224,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane @@ -538,32 +543,31 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 -; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7] +; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[4:5], exec -; GFX9-O0-NEXT: v_writelane_b32 v0, s4, 2 -; GFX9-O0-NEXT: v_writelane_b32 v0, s5, 3 +; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec +; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 2 +; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 3 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O0-NEXT: s_cbranch_execz .LBB0_3 -; GFX9-O0-NEXT: s_branch .LBB0_8 +; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX9-O0-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9-O0-NEXT: s_cbranch_scc1 .LBB0_7 +; GFX9-O0-NEXT: s_branch .LBB0_2 ; GFX9-O0-NEXT: .LBB0_1: ; %Flow ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v0, 4 -; GFX9-O0-NEXT: v_readlane_b32 s5, v0, 5 -; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: ; %bb.2: ; %Flow +; GFX9-O0-NEXT: v_readlane_b32 s4, v8, 4 +; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 5 ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload @@ -585,15 +589,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_branch .LBB0_5 -; GFX9-O0-NEXT: .LBB0_3: ; %Flow2 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v4, 2 -; GFX9-O0-NEXT: v_readlane_b32 s5, v4, 3 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: s_branch .LBB0_4 +; GFX9-O0-NEXT: .LBB0_2: ; %Flow2 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -605,8 +603,14 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_branch .LBB0_9 -; GFX9-O0-NEXT: .LBB0_4: ; %udiv-loop-exit +; GFX9-O0-NEXT: s_branch .LBB0_8 +; GFX9-O0-NEXT: .LBB0_3: ; %udiv-loop-exit +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 2 +; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 3 ; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload @@ -615,13 +619,13 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b32 s4, 1 +; GFX9-O0-NEXT: s_mov_b32 s6, 1 ; GFX9-O0-NEXT: s_waitcnt vmcnt(2) -; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[0:1] +; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s6, v[0:1] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_lshlrev_b64 v[9:10], s4, v[9:10] -; GFX9-O0-NEXT: s_mov_b32 s4, 63 -; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] +; GFX9-O0-NEXT: v_lshlrev_b64 v[9:10], s6, v[9:10] +; GFX9-O0-NEXT: s_mov_b32 s6, 63 +; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s6, v[0:1] ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 ; GFX9-O0-NEXT: v_mov_b32_e32 v12, v8 @@ -645,15 +649,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_branch .LBB0_3 -; GFX9-O0-NEXT: .LBB0_5: ; %Flow1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v8, 6 -; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 7 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: s_branch .LBB0_2 +; GFX9-O0-NEXT: .LBB0_4: ; %Flow1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload @@ -675,15 +673,15 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_branch .LBB0_4 -; GFX9-O0-NEXT: .LBB0_6: ; %udiv-do-while +; GFX9-O0-NEXT: s_branch .LBB0_3 +; GFX9-O0-NEXT: .LBB0_5: ; %udiv-do-while ; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s6, v16, 8 -; GFX9-O0-NEXT: v_readlane_b32 s7, v16, 9 +; GFX9-O0-NEXT: v_readlane_b32 s6, v16, 6 +; GFX9-O0-NEXT: v_readlane_b32 s7, v16, 7 ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload @@ -844,7 +842,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v18, v19 ; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[17:18], v[12:13] -; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX9-O0-NEXT: s_or_b64 s[6:7], s[4:5], s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v18, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v2 ; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill @@ -865,12 +863,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 4 -; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 5 -; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 8 -; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 9 +; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX9-O0-NEXT: v_writelane_b32 v16, s4, 6 +; GFX9-O0-NEXT: v_writelane_b32 v16, s5, 7 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] @@ -898,10 +893,12 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: s_cbranch_execnz .LBB0_6 +; GFX9-O0-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX9-O0-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GFX9-O0-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX9-O0-NEXT: s_cbranch_scc1 .LBB0_5 ; GFX9-O0-NEXT: s_branch .LBB0_1 -; GFX9-O0-NEXT: .LBB0_7: ; %udiv-preheader +; GFX9-O0-NEXT: .LBB0_6: ; %udiv-preheader ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload @@ -1004,8 +1001,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v14, s8 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, s7 ; GFX9-O0-NEXT: v_mov_b32_e32 v12, s6 -; GFX9-O0-NEXT: v_writelane_b32 v16, s4, 8 -; GFX9-O0-NEXT: v_writelane_b32 v16, s5, 9 +; GFX9-O0-NEXT: v_writelane_b32 v16, s4, 6 +; GFX9-O0-NEXT: v_writelane_b32 v16, s5, 7 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] @@ -1033,8 +1030,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_branch .LBB0_6 -; GFX9-O0-NEXT: .LBB0_8: ; %udiv-bb1 +; GFX9-O0-NEXT: s_branch .LBB0_5 +; GFX9-O0-NEXT: .LBB0_7: ; %udiv-bb1 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] @@ -1159,18 +1156,17 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec -; GFX9-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] -; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] -; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 6 -; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 7 +; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 4 +; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 5 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O0-NEXT: s_cbranch_execz .LBB0_5 -; GFX9-O0-NEXT: s_branch .LBB0_7 -; GFX9-O0-NEXT: .LBB0_9: ; %udiv-end +; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX9-O0-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9-O0-NEXT: s_cbranch_scc1 .LBB0_6 +; GFX9-O0-NEXT: s_branch .LBB0_4 +; GFX9-O0-NEXT: .LBB0_8: ; %udiv-end ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] @@ -1226,8 +1222,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) @@ -1239,9 +1237,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_ashrrev_i32_e32 v16, 31, v3 ; GFX9-G-NEXT: v_xor_b32_e32 v0, v16, v0 ; GFX9-G-NEXT: v_xor_b32_e32 v1, v16, v1 -; GFX9-G-NEXT: v_sub_co_u32_e32 v10, vcc, v0, v16 +; GFX9-G-NEXT: v_sub_co_u32_e32 v8, vcc, v0, v16 ; GFX9-G-NEXT: v_xor_b32_e32 v2, v16, v2 -; GFX9-G-NEXT: v_subb_co_u32_e32 v11, vcc, v1, v16, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v9, vcc, v1, v16, vcc ; GFX9-G-NEXT: v_ashrrev_i32_e32 v17, 31, v7 ; GFX9-G-NEXT: v_xor_b32_e32 v3, v16, v3 ; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v2, v16, vcc @@ -1257,8 +1255,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_or_b32_e32 v0, v18, v4 ; GFX9-G-NEXT: v_or_b32_e32 v1, v19, v5 ; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GFX9-G-NEXT: v_or_b32_e32 v0, v10, v12 -; GFX9-G-NEXT: v_or_b32_e32 v1, v11, v13 +; GFX9-G-NEXT: v_or_b32_e32 v0, v8, v12 +; GFX9-G-NEXT: v_or_b32_e32 v1, v9, v13 ; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GFX9-G-NEXT: v_ffbh_u32_e32 v1, v18 ; GFX9-G-NEXT: v_ffbh_u32_e32 v0, v19 @@ -1270,9 +1268,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[4:5] ; GFX9-G-NEXT: v_add_u32_e32 v0, 64, v0 ; GFX9-G-NEXT: v_min_u32_e32 v1, v1, v2 -; GFX9-G-NEXT: v_ffbh_u32_e32 v2, v10 +; GFX9-G-NEXT: v_ffbh_u32_e32 v2, v8 ; GFX9-G-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[6:7] -; GFX9-G-NEXT: v_ffbh_u32_e32 v1, v11 +; GFX9-G-NEXT: v_ffbh_u32_e32 v1, v9 ; GFX9-G-NEXT: v_add_u32_e32 v2, 32, v2 ; GFX9-G-NEXT: v_ffbh_u32_e32 v3, v12 ; GFX9-G-NEXT: v_min_u32_e32 v1, v1, v2 @@ -1295,60 +1293,65 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_or_b32_e32 v15, v1, v3 ; GFX9-G-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[6:7] ; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[2:3] -; GFX9-G-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-G-NEXT: s_mov_b64 s[12:13], exec ; GFX9-G-NEXT: v_cndmask_b32_e64 v6, v7, v6, s[6:7] ; GFX9-G-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; GFX9-G-NEXT: v_or_b32_e32 v20, v7, v6 +; GFX9-G-NEXT: v_or_b32_e32 v11, v7, v6 ; GFX9-G-NEXT: v_xor_b32_e32 v6, 0x7f, v0 ; GFX9-G-NEXT: v_or_b32_e32 v14, v6, v2 -; GFX9-G-NEXT: v_and_b32_e32 v6, 1, v20 +; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[14:15] +; GFX9-G-NEXT: v_and_b32_e32 v6, 1, v11 +; GFX9-G-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] +; GFX9-G-NEXT: v_or_b32_e32 v11, v11, v14 +; GFX9-G-NEXT: v_and_b32_e32 v11, 1, v11 +; GFX9-G-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v11 +; GFX9-G-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GFX9-G-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GFX9-G-NEXT: v_cndmask_b32_e64 v6, v10, 0, vcc -; GFX9-G-NEXT: v_cndmask_b32_e64 v7, v11, 0, vcc -; GFX9-G-NEXT: v_cndmask_b32_e64 v8, v12, 0, vcc -; GFX9-G-NEXT: v_cndmask_b32_e64 v9, v13, 0, vcc -; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] -; GFX9-G-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GFX9-G-NEXT: v_or_b32_e32 v14, v20, v14 -; GFX9-G-NEXT: v_and_b32_e32 v14, 1, v14 -; GFX9-G-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GFX9-G-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GFX9-G-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GFX9-G-NEXT: s_cbranch_execz .LBB0_6 +; GFX9-G-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-G-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-G-NEXT: v_cndmask_b32_e64 v6, v8, 0, vcc +; GFX9-G-NEXT: v_cndmask_b32_e64 v7, v9, 0, vcc +; GFX9-G-NEXT: v_cndmask_b32_e64 v10, v12, 0, vcc +; GFX9-G-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX9-G-NEXT: v_cndmask_b32_e64 v11, v13, 0, vcc +; GFX9-G-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9-G-NEXT: s_cbranch_scc0 .LBB0_6 ; GFX9-G-NEXT: ; %bb.1: ; %udiv-bb1 ; GFX9-G-NEXT: v_add_co_u32_e32 v20, vcc, 1, v0 ; GFX9-G-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v1, vcc ; GFX9-G-NEXT: v_addc_co_u32_e32 v22, vcc, 0, v2, vcc ; GFX9-G-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v3, vcc ; GFX9-G-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GFX9-G-NEXT: v_sub_co_u32_e32 v8, vcc, 0x7f, v0 -; GFX9-G-NEXT: v_sub_u32_e32 v0, 64, v8 -; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v0, v[10:11] -; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], v8, v[12:13] -; GFX9-G-NEXT: v_subrev_u32_e32 v9, 64, v8 -; GFX9-G-NEXT: v_lshlrev_b64 v[6:7], v8, v[10:11] +; GFX9-G-NEXT: v_sub_co_u32_e32 v10, vcc, 0x7f, v0 +; GFX9-G-NEXT: v_sub_u32_e32 v0, 64, v10 +; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v0, v[8:9] +; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], v10, v[12:13] +; GFX9-G-NEXT: v_subrev_u32_e32 v11, 64, v10 +; GFX9-G-NEXT: v_lshlrev_b64 v[6:7], v10, v[8:9] ; GFX9-G-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX9-G-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], v9, v[10:11] -; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, v8 +; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], v11, v[8:9] +; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 ; GFX9-G-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-G-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 -; GFX9-G-NEXT: v_cndmask_b32_e32 v8, v0, v12, vcc -; GFX9-G-NEXT: v_cndmask_b32_e32 v9, v1, v13, vcc +; GFX9-G-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 +; GFX9-G-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-G-NEXT: v_cndmask_b32_e32 v10, v0, v12, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v11, v1, v13, vcc ; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX9-G-NEXT: s_xor_b64 s[6:7], s[4:5], exec ; GFX9-G-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-G-NEXT: s_and_b64 s[14:15], s[4:5], -1 ; GFX9-G-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-G-NEXT: v_mov_b32_e32 v2, s10 ; GFX9-G-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-G-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] -; GFX9-G-NEXT: s_xor_b64 s[12:13], exec, s[8:9] -; GFX9-G-NEXT: s_cbranch_execz .LBB0_5 +; GFX9-G-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9-G-NEXT: s_cbranch_scc0 .LBB0_5 ; GFX9-G-NEXT: ; %bb.2: ; %udiv-preheader ; GFX9-G-NEXT: v_sub_u32_e32 v2, 64, v20 -; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v20, v[10:11] +; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v20, v[8:9] ; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], v2, v[12:13] ; GFX9-G-NEXT: v_subrev_u32_e32 v24, 64, v20 ; GFX9-G-NEXT: v_lshrrev_b64 v[14:15], v20, v[12:13] @@ -1361,27 +1364,26 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_cndmask_b32_e32 v14, 0, v14, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v15, 0, v15, vcc ; GFX9-G-NEXT: v_add_co_u32_e32 v24, vcc, -1, v18 -; GFX9-G-NEXT: s_mov_b64 s[8:9], 0 ; GFX9-G-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v20 ; GFX9-G-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v19, vcc -; GFX9-G-NEXT: v_cndmask_b32_e64 v12, v0, v10, s[4:5] -; GFX9-G-NEXT: v_cndmask_b32_e64 v13, v1, v11, s[4:5] +; GFX9-G-NEXT: v_cndmask_b32_e64 v12, v0, v8, s[4:5] +; GFX9-G-NEXT: v_cndmask_b32_e64 v13, v1, v9, s[4:5] ; GFX9-G-NEXT: v_addc_co_u32_e32 v26, vcc, -1, v4, vcc ; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX9-G-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-G-NEXT: v_addc_co_u32_e32 v27, vcc, -1, v5, vcc -; GFX9-G-NEXT: v_mov_b32_e32 v11, 0 +; GFX9-G-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-G-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-G-NEXT: v_mov_b32_e32 v2, s10 ; GFX9-G-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-G-NEXT: .LBB0_3: ; %udiv-do-while ; GFX9-G-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[6:7] -; GFX9-G-NEXT: v_lshrrev_b32_e32 v10, 31, v7 +; GFX9-G-NEXT: v_lshrrev_b32_e32 v8, 31, v7 ; GFX9-G-NEXT: v_or_b32_e32 v6, v0, v2 ; GFX9-G-NEXT: v_or_b32_e32 v7, v1, v3 ; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[12:13] -; GFX9-G-NEXT: v_lshrrev_b32_e32 v12, 31, v9 +; GFX9-G-NEXT: v_lshrrev_b32_e32 v12, 31, v11 ; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], 1, v[14:15] ; GFX9-G-NEXT: v_or_b32_e32 v2, v2, v12 ; GFX9-G-NEXT: v_lshrrev_b32_e32 v14, 31, v13 @@ -1403,36 +1405,37 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_addc_co_u32_e32 v21, vcc, -1, v21, vcc ; GFX9-G-NEXT: v_addc_co_u32_e32 v22, vcc, -1, v22, vcc ; GFX9-G-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v23, vcc -; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] ; GFX9-G-NEXT: v_or_b32_e32 v0, v20, v22 ; GFX9-G-NEXT: v_or_b32_e32 v1, v21, v23 ; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GFX9-G-NEXT: v_or_b32_e32 v8, v8, v10 -; GFX9-G-NEXT: v_and_b32_e32 v10, 1, v28 -; GFX9-G-NEXT: v_mov_b32_e32 v0, v10 +; GFX9-G-NEXT: v_lshlrev_b64 v[10:11], 1, v[10:11] ; GFX9-G-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX9-G-NEXT: v_mov_b32_e32 v1, v11 -; GFX9-G-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX9-G-NEXT: s_cbranch_execnz .LBB0_3 +; GFX9-G-NEXT: v_or_b32_e32 v10, v10, v8 +; GFX9-G-NEXT: v_and_b32_e32 v8, 1, v28 +; GFX9-G-NEXT: s_andn2_b64 s[4:5], exec, s[8:9] +; GFX9-G-NEXT: v_mov_b32_e32 v0, v8 +; GFX9-G-NEXT: s_and_b64 s[10:11], s[4:5], -1 +; GFX9-G-NEXT: v_mov_b32_e32 v1, v9 +; GFX9-G-NEXT: s_cselect_b64 exec, s[4:5], s[8:9] +; GFX9-G-NEXT: s_cbranch_scc1 .LBB0_3 ; GFX9-G-NEXT: ; %bb.4: ; %Flow -; GFX9-G-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-G-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-G-NEXT: .LBB0_5: ; %Flow2 -; GFX9-G-NEXT: s_or_b64 exec, exec, s[12:13] ; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[6:7] -; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] +; GFX9-G-NEXT: v_lshlrev_b64 v[10:11], 1, v[10:11] ; GFX9-G-NEXT: v_lshrrev_b32_e32 v4, 31, v7 -; GFX9-G-NEXT: v_or_b32_e32 v8, v8, v4 +; GFX9-G-NEXT: v_or_b32_e32 v10, v10, v4 ; GFX9-G-NEXT: v_or_b32_e32 v6, v0, v2 ; GFX9-G-NEXT: v_or_b32_e32 v7, v1, v3 -; GFX9-G-NEXT: .LBB0_6: ; %Flow3 -; GFX9-G-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-G-NEXT: s_or_b64 exec, exec, s[12:13] +; GFX9-G-NEXT: .LBB0_6: ; %udiv-end ; GFX9-G-NEXT: v_xor_b32_e32 v3, v17, v16 ; GFX9-G-NEXT: v_xor_b32_e32 v0, v6, v3 ; GFX9-G-NEXT: v_xor_b32_e32 v1, v7, v3 ; GFX9-G-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v3 -; GFX9-G-NEXT: v_xor_b32_e32 v2, v8, v3 +; GFX9-G-NEXT: v_xor_b32_e32 v2, v10, v3 ; GFX9-G-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-G-NEXT: v_xor_b32_e32 v4, v9, v3 +; GFX9-G-NEXT: v_xor_b32_e32 v4, v11, v3 ; GFX9-G-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v3, vcc ; GFX9-G-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc ; GFX9-G-NEXT: s_setpc_b64 s[30:31] @@ -1442,10 +1445,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-G-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-G-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane ; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v0 @@ -1728,31 +1730,30 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_and_b32_e32 v5, 1, v5 ; GFX9-G-O0-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v5 ; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], -1 -; GFX9-G-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] +; GFX9-G-O0-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], exec -; GFX9-G-O0-NEXT: v_writelane_b32 v0, s4, 0 -; GFX9-G-O0-NEXT: v_writelane_b32 v0, s5, 1 +; GFX9-G-O0-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], exec +; GFX9-G-O0-NEXT: v_writelane_b32 v0, s6, 0 +; GFX9-G-O0-NEXT: v_writelane_b32 v0, s7, 1 ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-G-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-G-O0-NEXT: s_cbranch_execz .LBB0_3 -; GFX9-G-O0-NEXT: s_branch .LBB0_8 +; GFX9-G-O0-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX9-G-O0-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9-G-O0-NEXT: s_cbranch_scc1 .LBB0_7 +; GFX9-G-O0-NEXT: s_branch .LBB0_2 ; GFX9-G-O0-NEXT: .LBB0_1: ; %Flow ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-G-O0-NEXT: v_readlane_b32 s4, v0, 2 -; GFX9-G-O0-NEXT: v_readlane_b32 s5, v0, 3 -; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-G-O0-NEXT: ; %bb.2: ; %Flow +; GFX9-G-O0-NEXT: v_readlane_b32 s4, v8, 2 +; GFX9-G-O0-NEXT: v_readlane_b32 s5, v8, 3 ; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload @@ -1772,15 +1773,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_branch .LBB0_5 -; GFX9-G-O0-NEXT: .LBB0_3: ; %Flow2 -; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-G-O0-NEXT: v_readlane_b32 s4, v4, 0 -; GFX9-G-O0-NEXT: v_readlane_b32 s5, v4, 1 ; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-G-O0-NEXT: s_branch .LBB0_4 +; GFX9-G-O0-NEXT: .LBB0_2: ; %Flow2 ; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -1791,8 +1786,14 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_branch .LBB0_9 -; GFX9-G-O0-NEXT: .LBB0_4: ; %udiv-loop-exit +; GFX9-G-O0-NEXT: s_branch .LBB0_8 +; GFX9-G-O0-NEXT: .LBB0_3: ; %udiv-loop-exit +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_readlane_b32 s4, v0, 0 +; GFX9-G-O0-NEXT: v_readlane_b32 s5, v0, 1 ; GFX9-G-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload @@ -1806,18 +1807,18 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v6 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v7 -; GFX9-G-O0-NEXT: s_mov_b32 s4, 1 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-G-O0-NEXT: s_mov_b32 s6, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-G-O0-NEXT: v_lshlrev_b64 v[10:11], v0, v[2:3] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-G-O0-NEXT: v_lshlrev_b64 v[0:1], v0, v[4:5] ; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr2 killed $exec ; GFX9-G-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr2_vgpr3 killed $exec -; GFX9-G-O0-NEXT: s_mov_b32 s4, 31 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-G-O0-NEXT: s_mov_b32 s6, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v6, v2, v3 -; GFX9-G-O0-NEXT: s_mov_b32 s4, 0 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-G-O0-NEXT: s_mov_b32 s6, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v14 @@ -1846,15 +1847,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_branch .LBB0_3 -; GFX9-G-O0-NEXT: .LBB0_5: ; %Flow1 -; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-G-O0-NEXT: v_readlane_b32 s4, v8, 4 -; GFX9-G-O0-NEXT: v_readlane_b32 s5, v8, 5 ; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-G-O0-NEXT: s_branch .LBB0_2 +; GFX9-G-O0-NEXT: .LBB0_4: ; %Flow1 ; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload @@ -1874,15 +1869,15 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_branch .LBB0_4 -; GFX9-G-O0-NEXT: .LBB0_6: ; %udiv-do-while +; GFX9-G-O0-NEXT: s_branch .LBB0_3 +; GFX9-G-O0-NEXT: .LBB0_5: ; %udiv-do-while ; GFX9-G-O0-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-G-O0-NEXT: v_readlane_b32 s6, v16, 6 -; GFX9-G-O0-NEXT: v_readlane_b32 s7, v16, 7 +; GFX9-G-O0-NEXT: v_readlane_b32 s6, v16, 4 +; GFX9-G-O0-NEXT: v_readlane_b32 s7, v16, 5 ; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload @@ -2051,7 +2046,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, s5 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, s4 ; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[17:18], v[19:20] -; GFX9-G-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX9-G-O0-NEXT: s_or_b64 s[6:7], s[4:5], s[6:7] ; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, v3 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, v2 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v18, v1 @@ -2070,12 +2065,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-G-O0-NEXT: v_writelane_b32 v16, s6, 2 -; GFX9-G-O0-NEXT: v_writelane_b32 v16, s7, 3 -; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-G-O0-NEXT: v_writelane_b32 v16, s6, 6 -; GFX9-G-O0-NEXT: v_writelane_b32 v16, s7, 7 +; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX9-G-O0-NEXT: v_writelane_b32 v16, s4, 4 +; GFX9-G-O0-NEXT: v_writelane_b32 v16, s5, 5 ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] @@ -2099,10 +2091,12 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-G-O0-NEXT: s_cbranch_execnz .LBB0_6 +; GFX9-G-O0-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX9-G-O0-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GFX9-G-O0-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX9-G-O0-NEXT: s_cbranch_scc1 .LBB0_5 ; GFX9-G-O0-NEXT: s_branch .LBB0_1 -; GFX9-G-O0-NEXT: .LBB0_7: ; %udiv-preheader +; GFX9-G-O0-NEXT: .LBB0_6: ; %udiv-preheader ; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload @@ -2192,8 +2186,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], s[8:9] -; GFX9-G-O0-NEXT: v_writelane_b32 v12, s8, 6 -; GFX9-G-O0-NEXT: v_writelane_b32 v12, s9, 7 +; GFX9-G-O0-NEXT: v_writelane_b32 v12, s8, 4 +; GFX9-G-O0-NEXT: v_writelane_b32 v12, s9, 5 ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] @@ -2221,8 +2215,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_branch .LBB0_6 -; GFX9-G-O0-NEXT: .LBB0_8: ; %udiv-bb1 +; GFX9-G-O0-NEXT: s_branch .LBB0_5 +; GFX9-G-O0-NEXT: .LBB0_7: ; %udiv-bb1 ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] @@ -2333,18 +2327,17 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], exec -; GFX9-G-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] -; GFX9-G-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] -; GFX9-G-O0-NEXT: v_writelane_b32 v0, s6, 4 -; GFX9-G-O0-NEXT: v_writelane_b32 v0, s7, 5 +; GFX9-G-O0-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX9-G-O0-NEXT: v_writelane_b32 v0, s6, 2 +; GFX9-G-O0-NEXT: v_writelane_b32 v0, s7, 3 ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-G-O0-NEXT: s_cbranch_execz .LBB0_5 -; GFX9-G-O0-NEXT: s_branch .LBB0_7 -; GFX9-G-O0-NEXT: .LBB0_9: ; %udiv-end +; GFX9-G-O0-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX9-G-O0-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9-G-O0-NEXT: s_cbranch_scc1 .LBB0_6 +; GFX9-G-O0-NEXT: s_branch .LBB0_4 +; GFX9-G-O0-NEXT: .LBB0_8: ; %udiv-end ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] @@ -2377,10 +2370,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_nop 0 -; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-G-O0-NEXT: s_setpc_b64 s[30:31] @@ -2435,6 +2427,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_or_b32_e32 v10, v13, v15 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15] +; GFX9-NEXT: s_mov_b64 s[8:9], exec ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] ; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc @@ -2445,13 +2438,15 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[9:10] ; GFX9-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], vcc +; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec ; GFX9-NEXT: v_cndmask_b32_e64 v8, v3, 0, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v9, v2, 0, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v10, v1, 0, s[4:5] +; GFX9-NEXT: s_and_b64 s[10:11], s[6:7], -1 ; GFX9-NEXT: v_cndmask_b32_e64 v11, v0, 0, s[4:5] -; GFX9-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB1_6 +; GFX9-NEXT: s_cmov_b64 exec, s[6:7] +; GFX9-NEXT: s_cbranch_scc0 .LBB1_6 ; GFX9-NEXT: ; %bb.1: ; %udiv-bb1 ; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 1, v12 ; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v13, vcc @@ -2470,20 +2465,21 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_or_b32_e32 v10, v10, v13 ; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v15 ; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[4:5] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v15 ; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[10:11], v15, v[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v15 -; GFX9-NEXT: v_mov_b32_e32 v12, 0 -; GFX9-NEXT: v_mov_b32_e32 v14, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v3, s[6:7] ; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v2, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-NEXT: v_mov_b32_e32 v14, 0 +; GFX9-NEXT: s_xor_b64 s[6:7], vcc, exec ; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, v11, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v13, 0 ; GFX9-NEXT: v_mov_b32_e32 v15, 0 +; GFX9-NEXT: s_and_b64 s[10:11], vcc, -1 ; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, v10, s[4:5] -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB1_5 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB1_5 ; GFX9-NEXT: ; %bb.2: ; %udiv-preheader ; GFX9-NEXT: v_sub_u32_e32 v14, 64, v18 ; GFX9-NEXT: v_lshrrev_b64 v[12:13], v18, v[0:1] @@ -2546,16 +2542,17 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_or_b32_e32 v17, v19, v21 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] ; GFX9-NEXT: v_and_b32_e32 v12, 1, v26 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[10:11], exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v17, v13 ; GFX9-NEXT: v_or3_b32 v9, v9, 0, v15 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; GFX9-NEXT: v_mov_b32_e32 v16, v12 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: s_cselect_b64 exec, s[10:11], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB1_3 ; GFX9-NEXT: ; %bb.4: ; %Flow -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: .LBB1_5: ; %Flow2 ; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: .LBB1_5: ; %Flow2 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[10:11] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[8:9] ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v11 @@ -2563,8 +2560,8 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_or3_b32 v9, v2, v4, v14 ; GFX9-NEXT: v_or_b32_e32 v10, v13, v1 ; GFX9-NEXT: v_or_b32_e32 v11, v12, v0 -; GFX9-NEXT: .LBB1_6: ; %Flow3 ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-NEXT: .LBB1_6: ; %udiv-end ; GFX9-NEXT: v_mov_b32_e32 v0, v11 ; GFX9-NEXT: v_mov_b32_e32 v1, v10 ; GFX9-NEXT: v_mov_b32_e32 v2, v9 @@ -2576,8 +2573,8 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane @@ -2809,32 +2806,31 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 -; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7] +; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[4:5], exec -; GFX9-O0-NEXT: v_writelane_b32 v0, s4, 2 -; GFX9-O0-NEXT: v_writelane_b32 v0, s5, 3 +; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec +; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 2 +; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 3 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O0-NEXT: s_cbranch_execz .LBB1_3 -; GFX9-O0-NEXT: s_branch .LBB1_8 +; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX9-O0-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9-O0-NEXT: s_cbranch_scc1 .LBB1_7 +; GFX9-O0-NEXT: s_branch .LBB1_2 ; GFX9-O0-NEXT: .LBB1_1: ; %Flow ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v0, 4 -; GFX9-O0-NEXT: v_readlane_b32 s5, v0, 5 -; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: ; %bb.2: ; %Flow +; GFX9-O0-NEXT: v_readlane_b32 s4, v8, 4 +; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 5 ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload @@ -2856,15 +2852,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_branch .LBB1_5 -; GFX9-O0-NEXT: .LBB1_3: ; %Flow2 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v4, 2 -; GFX9-O0-NEXT: v_readlane_b32 s5, v4, 3 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: s_branch .LBB1_4 +; GFX9-O0-NEXT: .LBB1_2: ; %Flow2 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -2876,8 +2866,14 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_branch .LBB1_9 -; GFX9-O0-NEXT: .LBB1_4: ; %udiv-loop-exit +; GFX9-O0-NEXT: s_branch .LBB1_8 +; GFX9-O0-NEXT: .LBB1_3: ; %udiv-loop-exit +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 2 +; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 3 ; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload @@ -2886,13 +2882,13 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b32 s4, 1 +; GFX9-O0-NEXT: s_mov_b32 s6, 1 ; GFX9-O0-NEXT: s_waitcnt vmcnt(2) -; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[0:1] +; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s6, v[0:1] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_lshlrev_b64 v[9:10], s4, v[9:10] -; GFX9-O0-NEXT: s_mov_b32 s4, 63 -; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] +; GFX9-O0-NEXT: v_lshlrev_b64 v[9:10], s6, v[9:10] +; GFX9-O0-NEXT: s_mov_b32 s6, 63 +; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s6, v[0:1] ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 ; GFX9-O0-NEXT: v_mov_b32_e32 v12, v8 @@ -2916,15 +2912,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_branch .LBB1_3 -; GFX9-O0-NEXT: .LBB1_5: ; %Flow1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v8, 6 -; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 7 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: s_branch .LBB1_2 +; GFX9-O0-NEXT: .LBB1_4: ; %Flow1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -2946,15 +2936,15 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_branch .LBB1_4 -; GFX9-O0-NEXT: .LBB1_6: ; %udiv-do-while +; GFX9-O0-NEXT: s_branch .LBB1_3 +; GFX9-O0-NEXT: .LBB1_5: ; %udiv-do-while ; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s6, v16, 8 -; GFX9-O0-NEXT: v_readlane_b32 s7, v16, 9 +; GFX9-O0-NEXT: v_readlane_b32 s6, v16, 6 +; GFX9-O0-NEXT: v_readlane_b32 s7, v16, 7 ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload @@ -3115,7 +3105,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v18, v19 ; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[17:18], v[12:13] -; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX9-O0-NEXT: s_or_b64 s[6:7], s[4:5], s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v18, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v2 ; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill @@ -3136,12 +3126,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 4 -; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 5 -; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 8 -; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 9 +; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX9-O0-NEXT: v_writelane_b32 v16, s4, 6 +; GFX9-O0-NEXT: v_writelane_b32 v16, s5, 7 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] @@ -3169,10 +3156,12 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: s_cbranch_execnz .LBB1_6 +; GFX9-O0-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX9-O0-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GFX9-O0-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX9-O0-NEXT: s_cbranch_scc1 .LBB1_5 ; GFX9-O0-NEXT: s_branch .LBB1_1 -; GFX9-O0-NEXT: .LBB1_7: ; %udiv-preheader +; GFX9-O0-NEXT: .LBB1_6: ; %udiv-preheader ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload @@ -3275,8 +3264,8 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v14, s8 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, s7 ; GFX9-O0-NEXT: v_mov_b32_e32 v12, s6 -; GFX9-O0-NEXT: v_writelane_b32 v16, s4, 8 -; GFX9-O0-NEXT: v_writelane_b32 v16, s5, 9 +; GFX9-O0-NEXT: v_writelane_b32 v16, s4, 6 +; GFX9-O0-NEXT: v_writelane_b32 v16, s5, 7 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] @@ -3304,8 +3293,8 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_branch .LBB1_6 -; GFX9-O0-NEXT: .LBB1_8: ; %udiv-bb1 +; GFX9-O0-NEXT: s_branch .LBB1_5 +; GFX9-O0-NEXT: .LBB1_7: ; %udiv-bb1 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] @@ -3430,18 +3419,17 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec -; GFX9-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] -; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] -; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 6 -; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 7 +; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 4 +; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 5 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O0-NEXT: s_cbranch_execz .LBB1_5 -; GFX9-O0-NEXT: s_branch .LBB1_7 -; GFX9-O0-NEXT: .LBB1_9: ; %udiv-end +; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX9-O0-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9-O0-NEXT: s_cbranch_scc1 .LBB1_6 +; GFX9-O0-NEXT: s_branch .LBB1_4 +; GFX9-O0-NEXT: .LBB1_8: ; %udiv-end ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] @@ -3461,8 +3449,10 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) @@ -3512,26 +3502,29 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_or_b32_e32 v17, v13, v15 ; GFX9-G-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[6:7] ; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[14:15] -; GFX9-G-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-G-NEXT: s_mov_b64 s[12:13], exec ; GFX9-G-NEXT: v_cndmask_b32_e64 v8, v9, v8, s[6:7] ; GFX9-G-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; GFX9-G-NEXT: v_or_b32_e32 v18, v9, v8 +; GFX9-G-NEXT: v_or_b32_e32 v9, v9, v8 ; GFX9-G-NEXT: v_xor_b32_e32 v8, 0x7f, v12 ; GFX9-G-NEXT: v_or_b32_e32 v16, v8, v14 -; GFX9-G-NEXT: v_and_b32_e32 v8, 1, v18 +; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[16:17] +; GFX9-G-NEXT: v_and_b32_e32 v8, 1, v9 +; GFX9-G-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; GFX9-G-NEXT: v_or_b32_e32 v9, v9, v16 +; GFX9-G-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX9-G-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v9 +; GFX9-G-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GFX9-G-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-G-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-G-NEXT: s_mov_b64 s[8:9], 0 ; GFX9-G-NEXT: v_cndmask_b32_e64 v10, v0, 0, vcc ; GFX9-G-NEXT: v_cndmask_b32_e64 v11, v1, 0, vcc ; GFX9-G-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc +; GFX9-G-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-G-NEXT: v_cndmask_b32_e64 v9, v3, 0, vcc -; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] -; GFX9-G-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GFX9-G-NEXT: v_or_b32_e32 v16, v18, v16 -; GFX9-G-NEXT: v_and_b32_e32 v16, 1, v16 -; GFX9-G-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-G-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GFX9-G-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GFX9-G-NEXT: s_cbranch_execz .LBB1_6 +; GFX9-G-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9-G-NEXT: s_cbranch_scc0 .LBB1_6 ; GFX9-G-NEXT: ; %bb.1: ; %udiv-bb1 ; GFX9-G-NEXT: v_add_co_u32_e32 v18, vcc, 1, v12 ; GFX9-G-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v13, vcc @@ -3549,20 +3542,22 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], v14, v[0:1] ; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 ; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX9-G-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX9-G-NEXT: v_cndmask_b32_e32 v14, 0, v12, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v15, 0, v13, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc ; GFX9-G-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 +; GFX9-G-NEXT: s_xor_b64 s[6:7], s[4:5], exec ; GFX9-G-NEXT: v_mov_b32_e32 v13, s11 ; GFX9-G-NEXT: v_cndmask_b32_e32 v8, v8, v2, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v9, v9, v3, vcc +; GFX9-G-NEXT: s_and_b64 s[14:15], s[4:5], -1 ; GFX9-G-NEXT: v_mov_b32_e32 v11, s9 ; GFX9-G-NEXT: v_mov_b32_e32 v10, s8 ; GFX9-G-NEXT: v_mov_b32_e32 v12, s10 -; GFX9-G-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] -; GFX9-G-NEXT: s_xor_b64 s[12:13], exec, s[8:9] -; GFX9-G-NEXT: s_cbranch_execz .LBB1_5 +; GFX9-G-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9-G-NEXT: s_cbranch_scc0 .LBB1_5 ; GFX9-G-NEXT: ; %bb.2: ; %udiv-preheader ; GFX9-G-NEXT: v_sub_u32_e32 v12, 64, v18 ; GFX9-G-NEXT: v_subrev_u32_e32 v22, 64, v18 @@ -3573,7 +3568,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_or_b32_e32 v10, v10, v12 ; GFX9-G-NEXT: v_or_b32_e32 v11, v11, v13 ; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 -; GFX9-G-NEXT: s_mov_b64 s[8:9], 0 ; GFX9-G-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc @@ -3625,24 +3619,25 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_or_b32_e32 v11, v19, v21 ; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] ; GFX9-G-NEXT: v_subb_co_u32_e32 v16, vcc, v12, v17, vcc +; GFX9-G-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] +; GFX9-G-NEXT: s_andn2_b64 s[4:5], exec, s[8:9] ; GFX9-G-NEXT: v_mov_b32_e32 v11, v1 ; GFX9-G-NEXT: v_subb_co_u32_e32 v17, vcc, v13, v26, vcc -; GFX9-G-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] +; GFX9-G-NEXT: s_and_b64 s[10:11], s[4:5], -1 ; GFX9-G-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-G-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX9-G-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-G-NEXT: s_cselect_b64 exec, s[4:5], s[8:9] +; GFX9-G-NEXT: s_cbranch_scc1 .LBB1_3 ; GFX9-G-NEXT: ; %bb.4: ; %Flow -; GFX9-G-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-G-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-G-NEXT: .LBB1_5: ; %Flow2 -; GFX9-G-NEXT: s_or_b64 exec, exec, s[12:13] ; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], 1, v[14:15] ; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] ; GFX9-G-NEXT: v_lshrrev_b32_e32 v2, 31, v15 ; GFX9-G-NEXT: v_or_b32_e32 v8, v8, v2 ; GFX9-G-NEXT: v_or_b32_e32 v10, v10, v0 ; GFX9-G-NEXT: v_or_b32_e32 v11, v11, v1 -; GFX9-G-NEXT: .LBB1_6: ; %Flow3 -; GFX9-G-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-G-NEXT: s_or_b64 exec, exec, s[12:13] +; GFX9-G-NEXT: .LBB1_6: ; %udiv-end ; GFX9-G-NEXT: v_mov_b32_e32 v0, v10 ; GFX9-G-NEXT: v_mov_b32_e32 v1, v11 ; GFX9-G-NEXT: v_mov_b32_e32 v2, v8 @@ -3654,10 +3649,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-G-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-G-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane ; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v0 @@ -3864,31 +3858,30 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_and_b32_e32 v5, 1, v5 ; GFX9-G-O0-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v5 ; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], -1 -; GFX9-G-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] +; GFX9-G-O0-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], exec -; GFX9-G-O0-NEXT: v_writelane_b32 v0, s4, 0 -; GFX9-G-O0-NEXT: v_writelane_b32 v0, s5, 1 +; GFX9-G-O0-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], exec +; GFX9-G-O0-NEXT: v_writelane_b32 v0, s6, 0 +; GFX9-G-O0-NEXT: v_writelane_b32 v0, s7, 1 ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-G-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-G-O0-NEXT: s_cbranch_execz .LBB1_3 -; GFX9-G-O0-NEXT: s_branch .LBB1_8 +; GFX9-G-O0-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX9-G-O0-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9-G-O0-NEXT: s_cbranch_scc1 .LBB1_7 +; GFX9-G-O0-NEXT: s_branch .LBB1_2 ; GFX9-G-O0-NEXT: .LBB1_1: ; %Flow ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-G-O0-NEXT: v_readlane_b32 s4, v0, 2 -; GFX9-G-O0-NEXT: v_readlane_b32 s5, v0, 3 -; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-G-O0-NEXT: ; %bb.2: ; %Flow +; GFX9-G-O0-NEXT: v_readlane_b32 s4, v8, 2 +; GFX9-G-O0-NEXT: v_readlane_b32 s5, v8, 3 ; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload @@ -3908,15 +3901,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_branch .LBB1_5 -; GFX9-G-O0-NEXT: .LBB1_3: ; %Flow2 -; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-G-O0-NEXT: v_readlane_b32 s4, v4, 0 -; GFX9-G-O0-NEXT: v_readlane_b32 s5, v4, 1 ; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-G-O0-NEXT: s_branch .LBB1_4 +; GFX9-G-O0-NEXT: .LBB1_2: ; %Flow2 ; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -3927,8 +3914,14 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_branch .LBB1_9 -; GFX9-G-O0-NEXT: .LBB1_4: ; %udiv-loop-exit +; GFX9-G-O0-NEXT: s_branch .LBB1_8 +; GFX9-G-O0-NEXT: .LBB1_3: ; %udiv-loop-exit +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_readlane_b32 s4, v0, 0 +; GFX9-G-O0-NEXT: v_readlane_b32 s5, v0, 1 ; GFX9-G-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload @@ -3942,18 +3935,18 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v6 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v7 -; GFX9-G-O0-NEXT: s_mov_b32 s4, 1 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-G-O0-NEXT: s_mov_b32 s6, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-G-O0-NEXT: v_lshlrev_b64 v[10:11], v0, v[2:3] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-G-O0-NEXT: v_lshlrev_b64 v[0:1], v0, v[4:5] ; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr2 killed $exec ; GFX9-G-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr2_vgpr3 killed $exec -; GFX9-G-O0-NEXT: s_mov_b32 s4, 31 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-G-O0-NEXT: s_mov_b32 s6, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v6, v2, v3 -; GFX9-G-O0-NEXT: s_mov_b32 s4, 0 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-G-O0-NEXT: s_mov_b32 s6, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v14 @@ -3982,15 +3975,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_branch .LBB1_3 -; GFX9-G-O0-NEXT: .LBB1_5: ; %Flow1 -; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-G-O0-NEXT: v_readlane_b32 s4, v8, 4 -; GFX9-G-O0-NEXT: v_readlane_b32 s5, v8, 5 ; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-G-O0-NEXT: s_branch .LBB1_2 +; GFX9-G-O0-NEXT: .LBB1_4: ; %Flow1 ; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload @@ -4010,15 +3997,15 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_branch .LBB1_4 -; GFX9-G-O0-NEXT: .LBB1_6: ; %udiv-do-while +; GFX9-G-O0-NEXT: s_branch .LBB1_3 +; GFX9-G-O0-NEXT: .LBB1_5: ; %udiv-do-while ; GFX9-G-O0-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-G-O0-NEXT: v_readlane_b32 s6, v16, 6 -; GFX9-G-O0-NEXT: v_readlane_b32 s7, v16, 7 +; GFX9-G-O0-NEXT: v_readlane_b32 s6, v16, 4 +; GFX9-G-O0-NEXT: v_readlane_b32 s7, v16, 5 ; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload @@ -4187,7 +4174,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, s5 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, s4 ; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[17:18], v[19:20] -; GFX9-G-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX9-G-O0-NEXT: s_or_b64 s[6:7], s[4:5], s[6:7] ; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, v3 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, v2 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v18, v1 @@ -4206,12 +4193,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-G-O0-NEXT: v_writelane_b32 v16, s6, 2 -; GFX9-G-O0-NEXT: v_writelane_b32 v16, s7, 3 -; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-G-O0-NEXT: v_writelane_b32 v16, s6, 6 -; GFX9-G-O0-NEXT: v_writelane_b32 v16, s7, 7 +; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX9-G-O0-NEXT: v_writelane_b32 v16, s4, 4 +; GFX9-G-O0-NEXT: v_writelane_b32 v16, s5, 5 ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] @@ -4235,10 +4219,12 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-G-O0-NEXT: s_cbranch_execnz .LBB1_6 +; GFX9-G-O0-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX9-G-O0-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GFX9-G-O0-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX9-G-O0-NEXT: s_cbranch_scc1 .LBB1_5 ; GFX9-G-O0-NEXT: s_branch .LBB1_1 -; GFX9-G-O0-NEXT: .LBB1_7: ; %udiv-preheader +; GFX9-G-O0-NEXT: .LBB1_6: ; %udiv-preheader ; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload @@ -4328,8 +4314,8 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], s[8:9] -; GFX9-G-O0-NEXT: v_writelane_b32 v12, s8, 6 -; GFX9-G-O0-NEXT: v_writelane_b32 v12, s9, 7 +; GFX9-G-O0-NEXT: v_writelane_b32 v12, s8, 4 +; GFX9-G-O0-NEXT: v_writelane_b32 v12, s9, 5 ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] @@ -4357,8 +4343,8 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_branch .LBB1_6 -; GFX9-G-O0-NEXT: .LBB1_8: ; %udiv-bb1 +; GFX9-G-O0-NEXT: s_branch .LBB1_5 +; GFX9-G-O0-NEXT: .LBB1_7: ; %udiv-bb1 ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] @@ -4469,18 +4455,17 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], exec -; GFX9-G-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] -; GFX9-G-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] -; GFX9-G-O0-NEXT: v_writelane_b32 v0, s6, 4 -; GFX9-G-O0-NEXT: v_writelane_b32 v0, s7, 5 +; GFX9-G-O0-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX9-G-O0-NEXT: v_writelane_b32 v0, s6, 2 +; GFX9-G-O0-NEXT: v_writelane_b32 v0, s7, 3 ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-G-O0-NEXT: s_cbranch_execz .LBB1_5 -; GFX9-G-O0-NEXT: s_branch .LBB1_7 -; GFX9-G-O0-NEXT: .LBB1_9: ; %udiv-end +; GFX9-G-O0-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX9-G-O0-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9-G-O0-NEXT: s_cbranch_scc1 .LBB1_6 +; GFX9-G-O0-NEXT: s_branch .LBB1_4 +; GFX9-G-O0-NEXT: .LBB1_8: ; %udiv-end ; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] @@ -4497,10 +4482,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_nop 0 -; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-G-O0-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll index 16a03badcb132..a7d6a9fee5e64 100644 --- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll @@ -6,140 +6,144 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-LABEL: v_sdiv_v2i128_vv: ; SDAG: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b64 s[10:11], exec ; SDAG-NEXT: v_ashrrev_i32_e32 v24, 31, v3 ; SDAG-NEXT: v_ashrrev_i32_e32 v25, 31, v11 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 -; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f +; SDAG-NEXT: s_mov_b64 s[12:13], 0x7f ; SDAG-NEXT: v_mov_b32_e32 v26, v24 ; SDAG-NEXT: v_mov_b32_e32 v27, v25 ; SDAG-NEXT: v_xor_b32_e32 v17, v24, v3 ; SDAG-NEXT: v_xor_b32_e32 v18, v24, v2 ; SDAG-NEXT: v_xor_b32_e32 v1, v24, v1 ; SDAG-NEXT: v_xor_b32_e32 v0, v24, v0 -; SDAG-NEXT: v_xor_b32_e32 v19, v25, v11 -; SDAG-NEXT: v_xor_b32_e32 v20, v25, v10 -; SDAG-NEXT: v_xor_b32_e32 v9, v25, v9 -; SDAG-NEXT: v_xor_b32_e32 v8, v25, v8 +; SDAG-NEXT: v_xor_b32_e32 v11, v25, v11 +; SDAG-NEXT: v_xor_b32_e32 v10, v25, v10 +; SDAG-NEXT: v_xor_b32_e32 v19, v25, v9 +; SDAG-NEXT: v_xor_b32_e32 v20, v25, v8 ; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v0, v24 ; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v1, v24, vcc ; SDAG-NEXT: v_ffbh_u32_e32 v0, v2 -; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v18, v24, vcc +; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v18, v24, vcc ; SDAG-NEXT: v_add_i32_e64 v1, s[4:5], 32, v0 ; SDAG-NEXT: v_ffbh_u32_e32 v18, v3 -; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v17, v24, vcc -; SDAG-NEXT: v_or_b32_e32 v0, v2, v10 -; SDAG-NEXT: v_ffbh_u32_e32 v17, v10 +; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v17, v24, vcc +; SDAG-NEXT: v_or_b32_e32 v0, v2, v8 +; SDAG-NEXT: v_ffbh_u32_e32 v17, v8 ; SDAG-NEXT: v_min_u32_e32 v18, v1, v18 -; SDAG-NEXT: v_sub_i32_e32 v28, vcc, v8, v25 -; SDAG-NEXT: v_or_b32_e32 v1, v3, v11 -; SDAG-NEXT: v_add_i32_e64 v8, s[4:5], 32, v17 -; SDAG-NEXT: v_ffbh_u32_e32 v17, v11 +; SDAG-NEXT: v_sub_i32_e32 v28, vcc, v20, v25 +; SDAG-NEXT: v_or_b32_e32 v1, v3, v9 +; SDAG-NEXT: v_add_i32_e64 v17, s[4:5], 32, v17 +; SDAG-NEXT: v_ffbh_u32_e32 v20, v9 ; SDAG-NEXT: v_add_i32_e64 v18, s[4:5], 64, v18 ; SDAG-NEXT: v_addc_u32_e64 v21, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_subb_u32_e32 v29, vcc, v9, v25, vcc +; SDAG-NEXT: v_subb_u32_e32 v29, vcc, v19, v25, vcc ; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; SDAG-NEXT: v_ffbh_u32_e32 v1, v28 -; SDAG-NEXT: v_min_u32_e32 v8, v8, v17 -; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[10:11] -; SDAG-NEXT: v_cndmask_b32_e64 v17, v21, 0, s[6:7] -; SDAG-NEXT: v_subb_u32_e32 v0, vcc, v20, v25, vcc -; SDAG-NEXT: v_add_i32_e64 v9, s[8:9], 32, v1 -; SDAG-NEXT: v_ffbh_u32_e32 v20, v29 -; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, v8, s[6:7] -; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v19, v25, vcc -; SDAG-NEXT: v_or_b32_e32 v8, v28, v0 -; SDAG-NEXT: v_ffbh_u32_e32 v19, v0 -; SDAG-NEXT: v_min_u32_e32 v20, v9, v20 -; SDAG-NEXT: v_or_b32_e32 v9, v29, v1 -; SDAG-NEXT: v_add_i32_e32 v19, vcc, 32, v19 +; SDAG-NEXT: v_min_u32_e32 v17, v17, v20 +; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[8:9] +; SDAG-NEXT: v_cndmask_b32_e64 v19, v21, 0, s[6:7] +; SDAG-NEXT: v_subb_u32_e32 v0, vcc, v10, v25, vcc +; SDAG-NEXT: v_add_i32_e64 v20, s[8:9], 32, v1 +; SDAG-NEXT: v_ffbh_u32_e32 v21, v29 +; SDAG-NEXT: v_cndmask_b32_e64 v17, v18, v17, s[6:7] +; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v11, v25, vcc +; SDAG-NEXT: v_or_b32_e32 v10, v28, v0 +; SDAG-NEXT: v_ffbh_u32_e32 v18, v0 +; SDAG-NEXT: v_min_u32_e32 v20, v20, v21 +; SDAG-NEXT: v_or_b32_e32 v11, v29, v1 +; SDAG-NEXT: v_add_i32_e32 v18, vcc, 32, v18 ; SDAG-NEXT: v_ffbh_u32_e32 v21, v1 ; SDAG-NEXT: v_add_i32_e32 v20, vcc, 64, v20 ; SDAG-NEXT: v_addc_u32_e64 v22, s[6:7], 0, 0, vcc -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; SDAG-NEXT: v_min_u32_e32 v8, v19, v21 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] +; SDAG-NEXT: v_min_u32_e32 v10, v18, v21 ; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[0:1] -; SDAG-NEXT: v_cndmask_b32_e64 v9, v22, 0, s[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v11, v22, 0, s[6:7] ; SDAG-NEXT: s_or_b64 s[8:9], vcc, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v8, v20, v8, s[6:7] -; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v8, v18 -; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v9, v17, vcc -; SDAG-NEXT: v_xor_b32_e32 v17, 0x7f, v8 +; SDAG-NEXT: v_cndmask_b32_e64 v10, v20, v10, s[6:7] +; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v10, v17 +; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v11, v19, vcc +; SDAG-NEXT: v_xor_b32_e32 v17, 0x7f, v10 ; SDAG-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v16, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[8:9] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[12:13], v[10:11] ; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] ; SDAG-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v16, vcc ; SDAG-NEXT: v_or_b32_e32 v16, v17, v18 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] ; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc -; SDAG-NEXT: v_or_b32_e32 v17, v9, v19 +; SDAG-NEXT: v_or_b32_e32 v17, v11, v19 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] ; SDAG-NEXT: v_cndmask_b32_e32 v20, v21, v20, vcc ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17] ; SDAG-NEXT: v_and_b32_e32 v16, 1, v20 ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 ; SDAG-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v20, v11, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v20, v9, 0, s[4:5] ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; SDAG-NEXT: v_cndmask_b32_e64 v17, v10, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v17, v8, 0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v21, v3, 0, s[4:5] +; SDAG-NEXT: s_and_b64 s[6:7], s[6:7], vcc +; SDAG-NEXT: s_and_b64 s[6:7], s[6:7], exec +; SDAG-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; SDAG-NEXT: v_cndmask_b32_e64 v16, v2, 0, s[4:5] -; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB0_6 +; SDAG-NEXT: s_cmov_b64 exec, s[6:7] +; SDAG-NEXT: s_cbranch_scc0 .LBB0_6 ; SDAG-NEXT: ; %bb.1: ; %udiv-bb15 -; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v8 -; SDAG-NEXT: v_sub_i32_e64 v20, s[4:5], 63, v8 +; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v10 +; SDAG-NEXT: v_sub_i32_e64 v20, s[4:5], 63, v10 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: v_mov_b32_e32 v17, 0 -; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v9, vcc +; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v11, vcc ; SDAG-NEXT: v_lshl_b64 v[20:21], v[2:3], v20 ; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v18, vcc ; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v19, vcc ; SDAG-NEXT: v_or_b32_e32 v18, v30, v32 -; SDAG-NEXT: v_sub_i32_e32 v34, vcc, 0x7f, v8 +; SDAG-NEXT: v_sub_i32_e32 v34, vcc, 0x7f, v10 ; SDAG-NEXT: v_or_b32_e32 v19, v31, v33 -; SDAG-NEXT: v_lshl_b64 v[8:9], v[10:11], v34 +; SDAG-NEXT: v_lshl_b64 v[10:11], v[8:9], v34 ; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v34 ; SDAG-NEXT: v_lshl_b64 v[22:23], v[2:3], v34 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] ; SDAG-NEXT: v_lshr_b64 v[18:19], v[2:3], v35 -; SDAG-NEXT: v_or_b32_e32 v9, v9, v19 -; SDAG-NEXT: v_or_b32_e32 v8, v8, v18 +; SDAG-NEXT: v_or_b32_e32 v11, v11, v19 +; SDAG-NEXT: v_or_b32_e32 v10, v10, v18 +; SDAG-NEXT: s_xor_b64 s[6:7], vcc, exec ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v34 -; SDAG-NEXT: v_cndmask_b32_e64 v9, v21, v9, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v8, v20, v8, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v11, v21, v11, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v10, v20, v10, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v23, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v22, s[4:5] +; SDAG-NEXT: s_and_b64 s[4:5], vcc, -1 ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v34 -; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v11, v11, v9, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v10, v10, v8, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v18, 0 ; SDAG-NEXT: v_mov_b32_e32 v19, 0 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB0_5 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB0_5 ; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4 ; SDAG-NEXT: v_lshr_b64 v[16:17], v[2:3], v30 ; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v30 ; SDAG-NEXT: v_subrev_i32_e32 v36, vcc, 64, v30 -; SDAG-NEXT: v_lshr_b64 v[37:38], v[10:11], v30 +; SDAG-NEXT: v_lshr_b64 v[37:38], v[8:9], v30 ; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v28 -; SDAG-NEXT: s_mov_b64 s[10:11], 0 +; SDAG-NEXT: s_mov_b64 s[8:9], 0 ; SDAG-NEXT: v_mov_b32_e32 v22, 0 ; SDAG-NEXT: v_mov_b32_e32 v23, 0 ; SDAG-NEXT: v_mov_b32_e32 v18, 0 ; SDAG-NEXT: v_mov_b32_e32 v19, 0 -; SDAG-NEXT: v_lshl_b64 v[48:49], v[10:11], v35 -; SDAG-NEXT: v_lshr_b64 v[10:11], v[10:11], v36 +; SDAG-NEXT: v_lshl_b64 v[48:49], v[8:9], v35 +; SDAG-NEXT: v_lshr_b64 v[8:9], v[8:9], v36 ; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v29, vcc ; SDAG-NEXT: v_or_b32_e32 v17, v17, v49 ; SDAG-NEXT: v_or_b32_e32 v16, v16, v48 ; SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v0, vcc ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v30 -; SDAG-NEXT: v_cndmask_b32_e64 v17, v11, v17, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v16, v10, v16, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v38, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v37, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v17, v9, v17, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, v8, v16, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v9, 0, v38, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v37, s[4:5] ; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v1, vcc ; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 ; SDAG-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc @@ -147,22 +151,22 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_mov_b32_e32 v17, 0 ; SDAG-NEXT: .LBB0_3: ; %udiv-do-while3 ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 -; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 +; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v16, 31, v3 ; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v9 -; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v11 +; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v21 ; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 -; SDAG-NEXT: v_or_b32_e32 v10, v10, v16 +; SDAG-NEXT: v_or_b32_e32 v8, v8, v16 ; SDAG-NEXT: v_or_b32_e32 v2, v2, v38 -; SDAG-NEXT: v_or_b32_e32 v8, v8, v39 -; SDAG-NEXT: v_or_b32_e32 v9, v19, v9 +; SDAG-NEXT: v_or_b32_e32 v10, v10, v39 +; SDAG-NEXT: v_or_b32_e32 v11, v19, v11 ; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v34, v2 -; SDAG-NEXT: v_or_b32_e32 v8, v18, v8 +; SDAG-NEXT: v_or_b32_e32 v10, v18, v10 ; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v35, v3, vcc -; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v36, v10, vcc -; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v37, v11, vcc +; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v36, v8, vcc +; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v37, v9, vcc ; SDAG-NEXT: v_ashrrev_i32_e32 v38, 31, v16 ; SDAG-NEXT: v_and_b32_e32 v39, v38, v28 ; SDAG-NEXT: v_and_b32_e32 v48, v38, v29 @@ -171,8 +175,8 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_and_b32_e32 v38, v38, v1 ; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v2, v39 ; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v48, vcc -; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v10, v49, vcc -; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v11, v38, vcc +; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v8, v49, vcc +; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v9, v38, vcc ; SDAG-NEXT: v_add_i32_e32 v30, vcc, -1, v30 ; SDAG-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc ; SDAG-NEXT: v_addc_u32_e32 v32, vcc, -1, v32, vcc @@ -181,17 +185,18 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v39, v31, v33 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[38:39] ; SDAG-NEXT: v_or_b32_e32 v21, v23, v21 -; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; SDAG-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SDAG-NEXT: s_andn2_b64 s[4:5], exec, s[8:9] +; SDAG-NEXT: s_and_b64 s[12:13], s[4:5], -1 ; SDAG-NEXT: v_or_b32_e32 v20, v22, v20 ; SDAG-NEXT: v_mov_b32_e32 v23, v17 ; SDAG-NEXT: v_mov_b32_e32 v22, v16 -; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] -; SDAG-NEXT: s_cbranch_execnz .LBB0_3 +; SDAG-NEXT: s_cselect_b64 exec, s[4:5], s[8:9] +; SDAG-NEXT: s_cbranch_scc1 .LBB0_3 ; SDAG-NEXT: ; %bb.4: ; %Flow13 -; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: .LBB0_5: ; %Flow14 -; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] -; SDAG-NEXT: v_lshl_b64 v[0:1], v[8:9], 1 +; SDAG-NEXT: v_lshl_b64 v[0:1], v[10:11], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v21 ; SDAG-NEXT: v_lshl_b64 v[2:3], v[20:21], 1 ; SDAG-NEXT: v_or_b32_e32 v0, v0, v8 @@ -199,12 +204,13 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v21, v17, v3 ; SDAG-NEXT: v_or_b32_e32 v17, v18, v0 ; SDAG-NEXT: v_or_b32_e32 v16, v16, v2 -; SDAG-NEXT: .LBB0_6: ; %Flow16 -; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] +; SDAG-NEXT: .LBB0_6: ; %udiv-end1 +; SDAG-NEXT: s_mov_b64 s[10:11], exec ; SDAG-NEXT: v_ashrrev_i32_e32 v18, 31, v7 ; SDAG-NEXT: v_ashrrev_i32_e32 v19, 31, v15 ; SDAG-NEXT: v_mov_b32_e32 v9, 0 -; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f +; SDAG-NEXT: s_mov_b64 s[12:13], 0x7f ; SDAG-NEXT: v_mov_b32_e32 v22, v18 ; SDAG-NEXT: v_mov_b32_e32 v23, v19 ; SDAG-NEXT: v_xor_b32_e32 v0, v18, v7 @@ -260,7 +266,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v10, vcc ; SDAG-NEXT: v_xor_b32_e32 v10, 0x7f, v6 ; SDAG-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v9, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[6:7] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[12:13], v[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; SDAG-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v9, vcc ; SDAG-NEXT: v_or_b32_e32 v10, v10, v8 @@ -277,10 +283,12 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; SDAG-NEXT: v_cndmask_b32_e64 v11, v4, 0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v14, v3, 0, s[4:5] +; SDAG-NEXT: s_and_b64 s[6:7], s[6:7], vcc +; SDAG-NEXT: s_and_b64 s[6:7], s[6:7], exec +; SDAG-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; SDAG-NEXT: v_cndmask_b32_e64 v10, v2, 0, s[4:5] -; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB0_12 +; SDAG-NEXT: s_cmov_b64 exec, s[6:7] +; SDAG-NEXT: s_cbranch_scc0 .LBB0_12 ; SDAG-NEXT: ; %bb.7: ; %udiv-bb1 ; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v6 ; SDAG-NEXT: v_sub_i32_e64 v12, s[4:5], 63, v6 @@ -300,26 +308,27 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_lshr_b64 v[6:7], v[2:3], v6 ; SDAG-NEXT: v_or_b32_e32 v7, v15, v7 ; SDAG-NEXT: v_or_b32_e32 v6, v14, v6 +; SDAG-NEXT: s_xor_b64 s[6:7], vcc, exec ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v9 ; SDAG-NEXT: v_cndmask_b32_e64 v8, v13, v7, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v12, v12, v6, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v35, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v34, s[4:5] +; SDAG-NEXT: s_and_b64 s[4:5], vcc, -1 ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v9 ; SDAG-NEXT: v_cndmask_b32_e64 v9, v8, v5, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v8, v12, v4, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v12, 0 ; SDAG-NEXT: v_mov_b32_e32 v13, 0 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB0_11 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB0_11 ; SDAG-NEXT: ; %bb.8: ; %udiv-preheader ; SDAG-NEXT: v_lshr_b64 v[10:11], v[2:3], v30 ; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v30 ; SDAG-NEXT: v_subrev_i32_e32 v36, vcc, 64, v30 ; SDAG-NEXT: v_lshr_b64 v[37:38], v[4:5], v30 ; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v28 -; SDAG-NEXT: s_mov_b64 s[10:11], 0 +; SDAG-NEXT: s_mov_b64 s[8:9], 0 ; SDAG-NEXT: v_mov_b32_e32 v14, 0 ; SDAG-NEXT: v_mov_b32_e32 v15, 0 ; SDAG-NEXT: v_mov_b32_e32 v12, 0 @@ -376,16 +385,17 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v39, v31, v33 ; SDAG-NEXT: v_or_b32_e32 v38, v30, v32 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[38:39] -; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; SDAG-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SDAG-NEXT: s_andn2_b64 s[4:5], exec, s[8:9] +; SDAG-NEXT: s_and_b64 s[12:13], s[4:5], -1 ; SDAG-NEXT: v_or_b32_e32 v6, v14, v6 ; SDAG-NEXT: v_mov_b32_e32 v15, v11 ; SDAG-NEXT: v_mov_b32_e32 v14, v10 -; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] -; SDAG-NEXT: s_cbranch_execnz .LBB0_9 +; SDAG-NEXT: s_cselect_b64 exec, s[4:5], s[8:9] +; SDAG-NEXT: s_cbranch_scc1 .LBB0_9 ; SDAG-NEXT: ; %bb.10: ; %Flow -; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: .LBB0_11: ; %Flow11 -; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] ; SDAG-NEXT: v_lshl_b64 v[0:1], v[8:9], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v4, 31, v7 ; SDAG-NEXT: v_lshl_b64 v[2:3], v[6:7], 1 @@ -394,8 +404,8 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v14, v11, v3 ; SDAG-NEXT: v_or_b32_e32 v11, v12, v0 ; SDAG-NEXT: v_or_b32_e32 v10, v10, v2 -; SDAG-NEXT: .LBB0_12: ; %Flow12 -; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] +; SDAG-NEXT: .LBB0_12: ; %udiv-end ; SDAG-NEXT: v_xor_b32_e32 v3, v27, v26 ; SDAG-NEXT: v_xor_b32_e32 v2, v25, v24 ; SDAG-NEXT: v_xor_b32_e32 v7, v23, v22 @@ -421,6 +431,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-LABEL: v_sdiv_v2i128_vv: ; GISEL: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b64 s[12:13], exec ; GISEL-NEXT: s_mov_b64 s[8:9], 0 ; GISEL-NEXT: v_ashrrev_i32_e32 v24, 31, v3 ; GISEL-NEXT: v_ashrrev_i32_e32 v25, 31, v11 @@ -492,14 +503,16 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_or_b32_e32 v8, v9, v8 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v20, v16, 0, vcc -; GISEL-NEXT: v_and_b32_e32 v22, 1, v8 +; GISEL-NEXT: v_and_b32_e32 v9, 1, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v21, v17, 0, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v8, v18, 0, vcc +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v9 +; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GISEL-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GISEL-NEXT: v_cndmask_b32_e64 v9, v19, 0, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] -; GISEL-NEXT: s_cbranch_execz .LBB0_6 +; GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GISEL-NEXT: s_cbranch_scc0 .LBB0_6 ; GISEL-NEXT: ; %bb.1: ; %udiv-bb15 ; GISEL-NEXT: v_add_i32_e32 v28, vcc, 1, v0 ; GISEL-NEXT: v_addc_u32_e64 v29, s[4:5], 0, v1, vcc @@ -518,19 +531,21 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cndmask_b32_e32 v21, 0, v1, vcc ; GISEL-NEXT: v_or_b32_e32 v0, v8, v2 ; GISEL-NEXT: v_or_b32_e32 v1, v9, v3 +; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GISEL-NEXT: v_cndmask_b32_e32 v0, v22, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, v23, v1, vcc +; GISEL-NEXT: s_xor_b64 s[14:15], s[4:5], exec ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32 ; GISEL-NEXT: v_cndmask_b32_e32 v8, v0, v18, vcc +; GISEL-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GISEL-NEXT: v_cndmask_b32_e32 v9, v1, v19, vcc ; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] ; GISEL-NEXT: v_mov_b32_e32 v0, s8 ; GISEL-NEXT: v_mov_b32_e32 v1, s9 ; GISEL-NEXT: v_mov_b32_e32 v2, s10 ; GISEL-NEXT: v_mov_b32_e32 v3, s11 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GISEL-NEXT: s_xor_b64 s[14:15], exec, s[6:7] -; GISEL-NEXT: s_cbranch_execz .LBB0_5 +; GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GISEL-NEXT: s_cbranch_scc0 .LBB0_5 ; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4 ; GISEL-NEXT: v_subrev_i32_e32 v34, vcc, 64, v28 ; GISEL-NEXT: v_sub_i32_e32 v22, vcc, 64, v28 @@ -590,66 +605,68 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_and_b32_e32 v16, 1, v0 ; GISEL-NEXT: v_and_b32_e32 v36, v0, v10 ; GISEL-NEXT: v_and_b32_e32 v0, v0, v11 +; GISEL-NEXT: s_andn2_b64 s[4:5], exec, s[8:9] ; GISEL-NEXT: v_sub_i32_e32 v22, vcc, v3, v1 ; GISEL-NEXT: v_subb_u32_e32 v23, vcc, v37, v18, vcc +; GISEL-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GISEL-NEXT: v_subb_u32_e32 v18, vcc, v2, v36, vcc ; GISEL-NEXT: v_subb_u32_e32 v19, vcc, v19, v0, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v16 ; GISEL-NEXT: v_mov_b32_e32 v1, v17 -; GISEL-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GISEL-NEXT: s_cbranch_execnz .LBB0_3 +; GISEL-NEXT: s_cselect_b64 exec, s[4:5], s[8:9] +; GISEL-NEXT: s_cbranch_scc1 .LBB0_3 ; GISEL-NEXT: ; %bb.4: ; %Flow13 -; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] -; GISEL-NEXT: .LBB0_5: ; %Flow14 ; GISEL-NEXT: s_or_b64 exec, exec, s[14:15] +; GISEL-NEXT: .LBB0_5: ; %Flow14 ; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], 1 ; GISEL-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 ; GISEL-NEXT: v_lshrrev_b32_e32 v10, 31, v21 ; GISEL-NEXT: v_or_b32_e32 v8, v8, v10 ; GISEL-NEXT: v_or_b32_e32 v20, v0, v2 ; GISEL-NEXT: v_or_b32_e32 v21, v1, v3 -; GISEL-NEXT: .LBB0_6: ; %Flow16 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: .LBB0_6: ; %udiv-end1 +; GISEL-NEXT: s_mov_b64 s[12:13], exec ; GISEL-NEXT: s_mov_b64 s[8:9], 0 ; GISEL-NEXT: v_ashrrev_i32_e32 v18, 31, v7 ; GISEL-NEXT: v_ashrrev_i32_e32 v19, 31, v15 -; GISEL-NEXT: v_mov_b32_e32 v10, 0x7f -; GISEL-NEXT: v_mov_b32_e32 v11, 0 +; GISEL-NEXT: v_mov_b32_e32 v16, 0x7f +; GISEL-NEXT: v_mov_b32_e32 v17, 0 ; GISEL-NEXT: v_xor_b32_e32 v0, v18, v4 ; GISEL-NEXT: v_xor_b32_e32 v1, v18, v5 ; GISEL-NEXT: v_xor_b32_e32 v2, v18, v6 ; GISEL-NEXT: v_xor_b32_e32 v3, v18, v7 ; GISEL-NEXT: v_xor_b32_e32 v4, v19, v12 ; GISEL-NEXT: v_xor_b32_e32 v5, v19, v13 -; GISEL-NEXT: v_xor_b32_e32 v14, v19, v14 -; GISEL-NEXT: v_xor_b32_e32 v15, v19, v15 +; GISEL-NEXT: v_xor_b32_e32 v12, v19, v14 +; GISEL-NEXT: v_xor_b32_e32 v13, v19, v15 ; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v0, v18 ; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v1, v18, vcc ; GISEL-NEXT: v_sub_i32_e64 v22, s[4:5], v4, v19 ; GISEL-NEXT: v_subb_u32_e64 v23, s[4:5], v5, v19, s[4:5] -; GISEL-NEXT: v_subb_u32_e32 v12, vcc, v2, v18, vcc -; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v3, v18, vcc -; GISEL-NEXT: v_subb_u32_e64 v4, vcc, v14, v19, s[4:5] -; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v15, v19, vcc -; GISEL-NEXT: v_ffbh_u32_e32 v14, v23 -; GISEL-NEXT: v_ffbh_u32_e32 v15, v22 -; GISEL-NEXT: v_ffbh_u32_e32 v16, v7 -; GISEL-NEXT: v_ffbh_u32_e32 v17, v6 +; GISEL-NEXT: v_subb_u32_e32 v10, vcc, v2, v18, vcc +; GISEL-NEXT: v_subb_u32_e32 v11, vcc, v3, v18, vcc +; GISEL-NEXT: v_subb_u32_e64 v4, vcc, v12, v19, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v13, v19, vcc +; GISEL-NEXT: v_ffbh_u32_e32 v12, v23 +; GISEL-NEXT: v_ffbh_u32_e32 v13, v22 +; GISEL-NEXT: v_ffbh_u32_e32 v14, v7 +; GISEL-NEXT: v_ffbh_u32_e32 v15, v6 ; GISEL-NEXT: v_or_b32_e32 v0, v22, v4 ; GISEL-NEXT: v_or_b32_e32 v1, v23, v5 -; GISEL-NEXT: v_or_b32_e32 v2, v6, v12 -; GISEL-NEXT: v_or_b32_e32 v3, v7, v13 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, 32, v15 +; GISEL-NEXT: v_or_b32_e32 v2, v6, v10 +; GISEL-NEXT: v_or_b32_e32 v3, v7, v11 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, 32, v13 ; GISEL-NEXT: v_ffbh_u32_e32 v26, v5 ; GISEL-NEXT: v_ffbh_u32_e32 v27, v4 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, 32, v17 -; GISEL-NEXT: v_ffbh_u32_e32 v28, v13 -; GISEL-NEXT: v_ffbh_u32_e32 v29, v12 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, 32, v15 +; GISEL-NEXT: v_ffbh_u32_e32 v28, v11 +; GISEL-NEXT: v_ffbh_u32_e32 v29, v10 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3] -; GISEL-NEXT: v_min_u32_e32 v0, v14, v15 +; GISEL-NEXT: v_min_u32_e32 v0, v12, v13 ; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], 32, v27 -; GISEL-NEXT: v_min_u32_e32 v2, v16, v17 +; GISEL-NEXT: v_min_u32_e32 v2, v14, v15 ; GISEL-NEXT: v_add_i32_e64 v3, s[6:7], 32, v29 ; GISEL-NEXT: v_add_i32_e64 v0, s[6:7], 64, v0 ; GISEL-NEXT: v_min_u32_e32 v1, v26, v1 @@ -659,36 +676,38 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] ; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[12:13] +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] ; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_subb_u32_e64 v2, s[4:5], 0, 0, s[4:5] ; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[10:11] +; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[16:17] ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_xor_b32_e32 v10, 0x7f, v0 +; GISEL-NEXT: v_xor_b32_e32 v12, 0x7f, v0 ; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[2:3] ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_or_b32_e32 v10, v10, v2 -; GISEL-NEXT: v_or_b32_e32 v11, v1, v3 +; GISEL-NEXT: v_or_b32_e32 v12, v12, v2 +; GISEL-NEXT: v_or_b32_e32 v13, v1, v3 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GISEL-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_or_b32_e32 v11, v14, v15 -; GISEL-NEXT: v_and_b32_e32 v14, 1, v11 -; GISEL-NEXT: v_or_b32_e32 v10, v11, v10 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[12:13] +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v13, v14, v15 +; GISEL-NEXT: v_and_b32_e32 v14, 1, v13 +; GISEL-NEXT: v_or_b32_e32 v12, v13, v12 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, v6, 0, vcc -; GISEL-NEXT: v_and_b32_e32 v16, 1, v10 +; GISEL-NEXT: v_and_b32_e32 v13, 1, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v15, v7, 0, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v10, v12, 0, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v11, v13, 0, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] -; GISEL-NEXT: s_cbranch_execz .LBB0_12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, v10, 0, vcc +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v13 +; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GISEL-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GISEL-NEXT: v_cndmask_b32_e64 v13, v11, 0, vcc +; GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GISEL-NEXT: s_cbranch_scc0 .LBB0_12 ; GISEL-NEXT: ; %bb.7: ; %udiv-bb1 ; GISEL-NEXT: v_add_i32_e32 v26, vcc, 1, v0 ; GISEL-NEXT: v_addc_u32_e64 v27, s[4:5], 0, v1, vcc @@ -696,53 +715,55 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_addc_u32_e64 v28, vcc, 0, v2, s[4:5] ; GISEL-NEXT: v_addc_u32_e32 v29, vcc, 0, v3, vcc ; GISEL-NEXT: v_subrev_i32_e64 v14, s[4:5], 64, v30 -; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], 64, v30 +; GISEL-NEXT: v_sub_i32_e64 v12, s[4:5], 64, v30 ; GISEL-NEXT: v_lshl_b64 v[0:1], v[6:7], v30 -; GISEL-NEXT: v_lshl_b64 v[2:3], v[12:13], v30 +; GISEL-NEXT: v_lshl_b64 v[2:3], v[10:11], v30 ; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GISEL-NEXT: v_lshr_b64 v[10:11], v[6:7], v10 +; GISEL-NEXT: v_lshr_b64 v[12:13], v[6:7], v12 ; GISEL-NEXT: v_lshl_b64 v[16:17], v[6:7], v14 ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v30 ; GISEL-NEXT: v_cndmask_b32_e32 v14, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v15, 0, v1, vcc -; GISEL-NEXT: v_or_b32_e32 v0, v10, v2 -; GISEL-NEXT: v_or_b32_e32 v1, v11, v3 +; GISEL-NEXT: v_or_b32_e32 v0, v12, v2 +; GISEL-NEXT: v_or_b32_e32 v1, v13, v3 +; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GISEL-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc +; GISEL-NEXT: s_xor_b64 s[14:15], s[4:5], exec ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 -; GISEL-NEXT: v_cndmask_b32_e32 v10, v0, v12, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v11, v1, v13, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v12, v0, v10, vcc +; GISEL-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v1, v11, vcc ; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] ; GISEL-NEXT: v_mov_b32_e32 v0, s8 ; GISEL-NEXT: v_mov_b32_e32 v1, s9 ; GISEL-NEXT: v_mov_b32_e32 v2, s10 ; GISEL-NEXT: v_mov_b32_e32 v3, s11 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] -; GISEL-NEXT: s_cbranch_execz .LBB0_11 +; GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GISEL-NEXT: s_cbranch_scc0 .LBB0_11 ; GISEL-NEXT: ; %bb.8: ; %udiv-preheader ; GISEL-NEXT: v_subrev_i32_e32 v32, vcc, 64, v26 ; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 64, v26 -; GISEL-NEXT: v_lshr_b64 v[0:1], v[12:13], v26 +; GISEL-NEXT: v_lshr_b64 v[0:1], v[10:11], v26 ; GISEL-NEXT: v_lshr_b64 v[2:3], v[6:7], v26 ; GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GISEL-NEXT: v_add_i32_e32 v30, vcc, -1, v22 ; GISEL-NEXT: v_addc_u32_e32 v31, vcc, -1, v23, vcc -; GISEL-NEXT: v_lshl_b64 v[16:17], v[12:13], v16 -; GISEL-NEXT: v_lshr_b64 v[12:13], v[12:13], v32 +; GISEL-NEXT: v_lshl_b64 v[16:17], v[10:11], v16 +; GISEL-NEXT: v_lshr_b64 v[10:11], v[10:11], v32 ; GISEL-NEXT: v_addc_u32_e32 v32, vcc, -1, v4, vcc ; GISEL-NEXT: v_addc_u32_e32 v33, vcc, -1, v5, vcc ; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] ; GISEL-NEXT: v_or_b32_e32 v2, v2, v16 ; GISEL-NEXT: v_or_b32_e32 v3, v3, v17 ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v26 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v16, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v17, 0, v1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26 -; GISEL-NEXT: v_cndmask_b32_e32 v12, v2, v6, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v13, v3, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v10, v2, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v11, v3, v7, vcc ; GISEL-NEXT: v_mov_b32_e32 v7, 0 ; GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GISEL-NEXT: v_mov_b32_e32 v1, s5 @@ -750,20 +771,20 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_mov_b32_e32 v3, s7 ; GISEL-NEXT: .LBB0_9: ; %udiv-do-while ; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 -; GISEL-NEXT: v_lshl_b64 v[2:3], v[12:13], 1 +; GISEL-NEXT: v_lshl_b64 v[2:3], v[10:11], 1 ; GISEL-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 -; GISEL-NEXT: v_lshrrev_b32_e32 v6, 31, v13 -; GISEL-NEXT: v_lshrrev_b32_e32 v34, 31, v11 -; GISEL-NEXT: v_lshl_b64 v[12:13], v[14:15], 1 -; GISEL-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v6, 31, v11 +; GISEL-NEXT: v_lshrrev_b32_e32 v34, 31, v13 +; GISEL-NEXT: v_lshl_b64 v[10:11], v[14:15], 1 +; GISEL-NEXT: v_lshl_b64 v[12:13], v[12:13], 1 ; GISEL-NEXT: v_lshrrev_b32_e32 v14, 31, v15 ; GISEL-NEXT: v_add_i32_e32 v26, vcc, -1, v26 ; GISEL-NEXT: v_addc_u32_e32 v27, vcc, -1, v27, vcc ; GISEL-NEXT: v_or_b32_e32 v16, v16, v6 ; GISEL-NEXT: v_or_b32_e32 v2, v2, v34 -; GISEL-NEXT: v_or_b32_e32 v10, v10, v14 -; GISEL-NEXT: v_or_b32_e32 v14, v0, v12 -; GISEL-NEXT: v_or_b32_e32 v15, v1, v13 +; GISEL-NEXT: v_or_b32_e32 v12, v12, v14 +; GISEL-NEXT: v_or_b32_e32 v14, v0, v10 +; GISEL-NEXT: v_or_b32_e32 v15, v1, v11 ; GISEL-NEXT: v_addc_u32_e32 v28, vcc, -1, v28, vcc ; GISEL-NEXT: v_addc_u32_e32 v29, vcc, -1, v29, vcc ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v30, v2 @@ -776,30 +797,31 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v6 ; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GISEL-NEXT: v_and_b32_e32 v6, 1, v0 -; GISEL-NEXT: v_and_b32_e32 v12, v0, v22 -; GISEL-NEXT: v_and_b32_e32 v13, v0, v23 +; GISEL-NEXT: v_and_b32_e32 v10, v0, v22 +; GISEL-NEXT: v_and_b32_e32 v11, v0, v23 ; GISEL-NEXT: v_and_b32_e32 v34, v0, v4 ; GISEL-NEXT: v_and_b32_e32 v35, v0, v5 +; GISEL-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GISEL-NEXT: v_mov_b32_e32 v0, v6 ; GISEL-NEXT: v_mov_b32_e32 v1, v7 -; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v2, v12 -; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v3, v13, vcc +; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v2, v10 +; GISEL-NEXT: v_subb_u32_e32 v11, vcc, v3, v11, vcc +; GISEL-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v16, v34, vcc ; GISEL-NEXT: v_subb_u32_e32 v17, vcc, v17, v35, vcc -; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GISEL-NEXT: s_cbranch_execnz .LBB0_9 +; GISEL-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GISEL-NEXT: s_cbranch_scc1 .LBB0_9 ; GISEL-NEXT: ; %bb.10: ; %Flow -; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: s_or_b64 exec, exec, s[14:15] ; GISEL-NEXT: .LBB0_11: ; %Flow11 -; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] ; GISEL-NEXT: v_lshl_b64 v[2:3], v[14:15], 1 -; GISEL-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 +; GISEL-NEXT: v_lshl_b64 v[12:13], v[12:13], 1 ; GISEL-NEXT: v_lshrrev_b32_e32 v4, 31, v15 -; GISEL-NEXT: v_or_b32_e32 v10, v10, v4 +; GISEL-NEXT: v_or_b32_e32 v12, v12, v4 ; GISEL-NEXT: v_or_b32_e32 v14, v0, v2 ; GISEL-NEXT: v_or_b32_e32 v15, v1, v3 -; GISEL-NEXT: .LBB0_12: ; %Flow12 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: .LBB0_12: ; %udiv-end ; GISEL-NEXT: v_xor_b32_e32 v3, v25, v24 ; GISEL-NEXT: v_xor_b32_e32 v7, v19, v18 ; GISEL-NEXT: v_xor_b32_e32 v0, v20, v3 @@ -808,8 +830,8 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_xor_b32_e32 v6, v9, v3 ; GISEL-NEXT: v_xor_b32_e32 v4, v14, v7 ; GISEL-NEXT: v_xor_b32_e32 v5, v15, v7 -; GISEL-NEXT: v_xor_b32_e32 v8, v10, v7 -; GISEL-NEXT: v_xor_b32_e32 v9, v11, v7 +; GISEL-NEXT: v_xor_b32_e32 v8, v12, v7 +; GISEL-NEXT: v_xor_b32_e32 v9, v13, v7 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v4, v7 @@ -827,6 +849,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-LABEL: v_udiv_v2i128_vv: ; SDAG: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b64 s[8:9], exec ; SDAG-NEXT: v_or_b32_e32 v17, v9, v11 ; SDAG-NEXT: v_or_b32_e32 v16, v8, v10 ; SDAG-NEXT: v_or_b32_e32 v19, v1, v3 @@ -840,7 +863,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_ffbh_u32_e32 v26, v0 ; SDAG-NEXT: v_ffbh_u32_e32 v27, v1 ; SDAG-NEXT: v_mov_b32_e32 v28, 0 -; SDAG-NEXT: s_mov_b64 s[8:9], 0x7f +; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] ; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19] ; SDAG-NEXT: v_add_i32_e64 v16, s[6:7], 32, v20 @@ -862,18 +885,18 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; SDAG-NEXT: v_cndmask_b32_e64 v17, v21, 0, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc -; SDAG-NEXT: v_sub_i32_e32 v23, vcc, v16, v18 -; SDAG-NEXT: v_subb_u32_e32 v24, vcc, v20, v17, vcc -; SDAG-NEXT: v_xor_b32_e32 v16, 0x7f, v23 -; SDAG-NEXT: v_subbrev_u32_e32 v25, vcc, 0, v28, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[23:24] +; SDAG-NEXT: v_sub_i32_e32 v21, vcc, v16, v18 +; SDAG-NEXT: v_subb_u32_e32 v22, vcc, v20, v17, vcc +; SDAG-NEXT: v_xor_b32_e32 v16, 0x7f, v21 +; SDAG-NEXT: v_subbrev_u32_e32 v23, vcc, 0, v28, vcc +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[21:22] ; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; SDAG-NEXT: v_subbrev_u32_e32 v26, vcc, 0, v28, vcc -; SDAG-NEXT: v_or_b32_e32 v16, v16, v25 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[25:26] +; SDAG-NEXT: v_subbrev_u32_e32 v24, vcc, 0, v28, vcc +; SDAG-NEXT: v_or_b32_e32 v16, v16, v23 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[23:24] ; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; SDAG-NEXT: v_or_b32_e32 v17, v24, v26 -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[25:26] +; SDAG-NEXT: v_or_b32_e32 v17, v22, v24 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[23:24] ; SDAG-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17] ; SDAG-NEXT: v_and_b32_e32 v16, 1, v18 @@ -883,44 +906,47 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; SDAG-NEXT: v_cndmask_b32_e64 v17, v2, 0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v18, v1, 0, s[4:5] +; SDAG-NEXT: s_and_b64 s[6:7], s[6:7], vcc +; SDAG-NEXT: s_and_b64 s[6:7], s[6:7], exec +; SDAG-NEXT: s_and_b64 s[10:11], s[6:7], -1 ; SDAG-NEXT: v_cndmask_b32_e64 v19, v0, 0, s[4:5] -; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; SDAG-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB1_6 +; SDAG-NEXT: s_cmov_b64 exec, s[6:7] +; SDAG-NEXT: s_cbranch_scc0 .LBB1_6 ; SDAG-NEXT: ; %bb.1: ; %udiv-bb15 -; SDAG-NEXT: v_add_i32_e32 v18, vcc, 1, v23 -; SDAG-NEXT: v_sub_i32_e64 v16, s[4:5], 63, v23 -; SDAG-NEXT: v_mov_b32_e32 v21, 0 -; SDAG-NEXT: v_mov_b32_e32 v22, 0 -; SDAG-NEXT: v_addc_u32_e32 v27, vcc, 0, v24, vcc +; SDAG-NEXT: v_add_i32_e32 v18, vcc, 1, v21 +; SDAG-NEXT: v_sub_i32_e64 v16, s[4:5], 63, v21 +; SDAG-NEXT: v_mov_b32_e32 v19, 0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: v_addc_u32_e32 v27, vcc, 0, v22, vcc ; SDAG-NEXT: v_lshl_b64 v[16:17], v[0:1], v16 -; SDAG-NEXT: v_addc_u32_e32 v28, vcc, 0, v25, vcc -; SDAG-NEXT: v_addc_u32_e32 v29, vcc, 0, v26, vcc -; SDAG-NEXT: v_or_b32_e32 v19, v18, v28 -; SDAG-NEXT: v_sub_i32_e32 v30, vcc, 0x7f, v23 -; SDAG-NEXT: v_or_b32_e32 v20, v27, v29 -; SDAG-NEXT: v_lshl_b64 v[23:24], v[2:3], v30 -; SDAG-NEXT: v_sub_i32_e32 v31, vcc, 64, v30 -; SDAG-NEXT: v_lshl_b64 v[25:26], v[0:1], v30 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[19:20] -; SDAG-NEXT: v_lshr_b64 v[19:20], v[0:1], v31 -; SDAG-NEXT: v_or_b32_e32 v20, v24, v20 -; SDAG-NEXT: v_or_b32_e32 v19, v23, v19 -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v30 -; SDAG-NEXT: v_cndmask_b32_e64 v17, v17, v20, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v16, v16, v19, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v24, 0, v26, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, v25, s[4:5] -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v30 +; SDAG-NEXT: v_addc_u32_e32 v28, vcc, 0, v23, vcc +; SDAG-NEXT: v_addc_u32_e32 v29, vcc, 0, v24, vcc +; SDAG-NEXT: v_or_b32_e32 v22, v18, v28 +; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v21 +; SDAG-NEXT: v_or_b32_e32 v23, v27, v29 +; SDAG-NEXT: v_lshl_b64 v[24:25], v[2:3], v26 +; SDAG-NEXT: v_sub_i32_e32 v21, vcc, 64, v26 +; SDAG-NEXT: v_lshl_b64 v[30:31], v[0:1], v26 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[22:23] +; SDAG-NEXT: v_lshr_b64 v[21:22], v[0:1], v21 +; SDAG-NEXT: v_or_b32_e32 v22, v25, v22 +; SDAG-NEXT: v_or_b32_e32 v21, v24, v21 +; SDAG-NEXT: s_xor_b64 s[10:11], vcc, exec +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v26 +; SDAG-NEXT: v_cndmask_b32_e64 v17, v17, v22, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, v16, v21, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v24, 0, v31, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, v30, s[4:5] +; SDAG-NEXT: s_and_b64 s[4:5], vcc, -1 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v26 ; SDAG-NEXT: v_cndmask_b32_e64 v17, v17, v3, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v16, v16, v2, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v19, 0 -; SDAG-NEXT: v_mov_b32_e32 v20, 0 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB1_5 +; SDAG-NEXT: v_mov_b32_e32 v21, 0 +; SDAG-NEXT: v_mov_b32_e32 v22, 0 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB1_5 ; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4 -; SDAG-NEXT: v_lshr_b64 v[21:22], v[0:1], v18 +; SDAG-NEXT: v_lshr_b64 v[19:20], v[0:1], v18 ; SDAG-NEXT: v_sub_i32_e32 v31, vcc, 64, v18 ; SDAG-NEXT: v_subrev_i32_e32 v36, vcc, 64, v18 ; SDAG-NEXT: v_lshr_b64 v[32:33], v[2:3], v18 @@ -928,8 +954,8 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: s_mov_b64 s[12:13], 0 ; SDAG-NEXT: v_mov_b32_e32 v25, 0 ; SDAG-NEXT: v_mov_b32_e32 v26, 0 -; SDAG-NEXT: v_mov_b32_e32 v19, 0 -; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: v_mov_b32_e32 v21, 0 +; SDAG-NEXT: v_mov_b32_e32 v22, 0 ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v18 ; SDAG-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v18 ; SDAG-NEXT: v_lshl_b64 v[34:35], v[2:3], v31 @@ -937,18 +963,18 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_addc_u32_e32 v31, vcc, -1, v9, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v33, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v32, s[4:5] -; SDAG-NEXT: v_or_b32_e32 v22, v22, v35 -; SDAG-NEXT: v_or_b32_e32 v21, v21, v34 +; SDAG-NEXT: v_or_b32_e32 v20, v20, v35 +; SDAG-NEXT: v_or_b32_e32 v19, v19, v34 ; SDAG-NEXT: v_addc_u32_e32 v32, vcc, -1, v10, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v22, v37, v22, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v21, v36, v21, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v20, v37, v20, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v19, v36, v19, s[4:5] ; SDAG-NEXT: v_addc_u32_e32 v33, vcc, -1, v11, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v1, v22, v1, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v0, v21, v0, s[6:7] -; SDAG-NEXT: v_mov_b32_e32 v22, 0 +; SDAG-NEXT: v_cndmask_b32_e64 v1, v20, v1, s[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v0, v19, v0, s[6:7] +; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: .LBB1_3: ; %udiv-do-while3 ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 -; SDAG-NEXT: v_lshrrev_b32_e32 v21, 31, v24 +; SDAG-NEXT: v_lshrrev_b32_e32 v19, 31, v24 ; SDAG-NEXT: v_lshl_b64 v[23:24], v[23:24], 1 ; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v34, 31, v1 @@ -959,17 +985,17 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v23, v25, v23 ; SDAG-NEXT: v_or_b32_e32 v2, v2, v34 ; SDAG-NEXT: v_or_b32_e32 v0, v0, v35 -; SDAG-NEXT: v_or_b32_e32 v16, v16, v21 -; SDAG-NEXT: v_sub_i32_e32 v21, vcc, v30, v0 -; SDAG-NEXT: v_subb_u32_e32 v21, vcc, v31, v1, vcc -; SDAG-NEXT: v_subb_u32_e32 v21, vcc, v32, v2, vcc -; SDAG-NEXT: v_subb_u32_e32 v21, vcc, v33, v3, vcc -; SDAG-NEXT: v_ashrrev_i32_e32 v21, 31, v21 -; SDAG-NEXT: v_and_b32_e32 v25, v21, v8 -; SDAG-NEXT: v_and_b32_e32 v26, v21, v9 -; SDAG-NEXT: v_and_b32_e32 v34, v21, v10 -; SDAG-NEXT: v_and_b32_e32 v35, v21, v11 -; SDAG-NEXT: v_and_b32_e32 v21, 1, v21 +; SDAG-NEXT: v_or_b32_e32 v16, v16, v19 +; SDAG-NEXT: v_sub_i32_e32 v19, vcc, v30, v0 +; SDAG-NEXT: v_subb_u32_e32 v19, vcc, v31, v1, vcc +; SDAG-NEXT: v_subb_u32_e32 v19, vcc, v32, v2, vcc +; SDAG-NEXT: v_subb_u32_e32 v19, vcc, v33, v3, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v19, 31, v19 +; SDAG-NEXT: v_and_b32_e32 v25, v19, v8 +; SDAG-NEXT: v_and_b32_e32 v26, v19, v9 +; SDAG-NEXT: v_and_b32_e32 v34, v19, v10 +; SDAG-NEXT: v_and_b32_e32 v35, v19, v11 +; SDAG-NEXT: v_and_b32_e32 v19, 1, v19 ; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v25 ; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v26, vcc ; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v2, v34, vcc @@ -981,27 +1007,29 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v25, v18, v28 ; SDAG-NEXT: v_or_b32_e32 v26, v27, v29 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[25:26] -; SDAG-NEXT: v_or_b32_e32 v17, v20, v17 +; SDAG-NEXT: v_or_b32_e32 v17, v22, v17 ; SDAG-NEXT: s_or_b64 s[12:13], vcc, s[12:13] -; SDAG-NEXT: v_or_b32_e32 v16, v19, v16 -; SDAG-NEXT: v_mov_b32_e32 v26, v22 -; SDAG-NEXT: v_mov_b32_e32 v25, v21 -; SDAG-NEXT: s_andn2_b64 exec, exec, s[12:13] -; SDAG-NEXT: s_cbranch_execnz .LBB1_3 +; SDAG-NEXT: s_andn2_b64 s[4:5], exec, s[12:13] +; SDAG-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; SDAG-NEXT: v_or_b32_e32 v16, v21, v16 +; SDAG-NEXT: v_mov_b32_e32 v26, v20 +; SDAG-NEXT: v_mov_b32_e32 v25, v19 +; SDAG-NEXT: s_cselect_b64 exec, s[4:5], s[12:13] +; SDAG-NEXT: s_cbranch_scc1 .LBB1_3 ; SDAG-NEXT: ; %bb.4: ; %Flow13 -; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] -; SDAG-NEXT: .LBB1_5: ; %Flow14 ; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] +; SDAG-NEXT: .LBB1_5: ; %Flow14 ; SDAG-NEXT: v_lshl_b64 v[0:1], v[16:17], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v24 ; SDAG-NEXT: v_lshl_b64 v[2:3], v[23:24], 1 ; SDAG-NEXT: v_or_b32_e32 v0, v0, v8 -; SDAG-NEXT: v_or_b32_e32 v16, v20, v1 -; SDAG-NEXT: v_or_b32_e32 v18, v22, v3 -; SDAG-NEXT: v_or_b32_e32 v17, v19, v0 -; SDAG-NEXT: v_or_b32_e32 v19, v21, v2 -; SDAG-NEXT: .LBB1_6: ; %Flow16 +; SDAG-NEXT: v_or_b32_e32 v16, v22, v1 +; SDAG-NEXT: v_or_b32_e32 v18, v20, v3 +; SDAG-NEXT: v_or_b32_e32 v17, v21, v0 +; SDAG-NEXT: v_or_b32_e32 v19, v19, v2 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: .LBB1_6: ; %udiv-end1 +; SDAG-NEXT: s_mov_b64 s[8:9], exec ; SDAG-NEXT: v_or_b32_e32 v1, v13, v15 ; SDAG-NEXT: v_or_b32_e32 v0, v12, v14 ; SDAG-NEXT: v_or_b32_e32 v3, v5, v7 @@ -1015,7 +1043,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_ffbh_u32_e32 v22, v4 ; SDAG-NEXT: v_ffbh_u32_e32 v23, v5 ; SDAG-NEXT: v_mov_b32_e32 v24, 0 -; SDAG-NEXT: s_mov_b64 s[8:9], 0x7f +; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3] ; SDAG-NEXT: v_add_i32_e64 v0, s[6:7], 32, v8 @@ -1041,7 +1069,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v8, v1, vcc ; SDAG-NEXT: v_xor_b32_e32 v8, 0x7f, v0 ; SDAG-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v24, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[0:1] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[0:1] ; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] ; SDAG-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v24, vcc ; SDAG-NEXT: v_or_b32_e32 v8, v8, v2 @@ -1058,10 +1086,12 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; SDAG-NEXT: v_cndmask_b32_e64 v9, v6, 0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v10, v5, 0, s[4:5] +; SDAG-NEXT: s_and_b64 s[6:7], s[6:7], vcc +; SDAG-NEXT: s_and_b64 s[6:7], s[6:7], exec +; SDAG-NEXT: s_and_b64 s[10:11], s[6:7], -1 ; SDAG-NEXT: v_cndmask_b32_e64 v11, v4, 0, s[4:5] -; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB1_12 +; SDAG-NEXT: s_cmov_b64 exec, s[6:7] +; SDAG-NEXT: s_cbranch_scc0 .LBB1_12 ; SDAG-NEXT: ; %bb.7: ; %udiv-bb1 ; SDAG-NEXT: v_add_i32_e32 v8, vcc, 1, v0 ; SDAG-NEXT: v_sub_i32_e64 v9, s[4:5], 63, v0 @@ -1081,19 +1111,20 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_lshr_b64 v[0:1], v[4:5], v0 ; SDAG-NEXT: v_or_b32_e32 v1, v23, v1 ; SDAG-NEXT: v_or_b32_e32 v0, v22, v0 +; SDAG-NEXT: s_xor_b64 s[6:7], vcc, exec ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v3 ; SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v1, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, v0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v27, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v26, s[4:5] +; SDAG-NEXT: s_and_b64 s[4:5], vcc, -1 ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 ; SDAG-NEXT: v_cndmask_b32_e64 v3, v2, v7, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v2, v9, v6, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v9, 0 ; SDAG-NEXT: v_mov_b32_e32 v10, 0 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB1_11 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB1_11 ; SDAG-NEXT: ; %bb.8: ; %udiv-preheader ; SDAG-NEXT: v_lshr_b64 v[20:21], v[4:5], v8 ; SDAG-NEXT: v_sub_i32_e32 v27, vcc, 64, v8 @@ -1158,15 +1189,16 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v30, v8, v24 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[30:31] ; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; SDAG-NEXT: s_andn2_b64 s[4:5], exec, s[10:11] +; SDAG-NEXT: s_and_b64 s[12:13], s[4:5], -1 ; SDAG-NEXT: v_or_b32_e32 v0, v22, v0 ; SDAG-NEXT: v_mov_b32_e32 v23, v21 ; SDAG-NEXT: v_mov_b32_e32 v22, v20 -; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] -; SDAG-NEXT: s_cbranch_execnz .LBB1_9 +; SDAG-NEXT: s_cselect_b64 exec, s[4:5], s[10:11] +; SDAG-NEXT: s_cbranch_scc1 .LBB1_9 ; SDAG-NEXT: ; %bb.10: ; %Flow -; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: .LBB1_11: ; %Flow11 -; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] ; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v4, 31, v1 ; SDAG-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 @@ -1175,8 +1207,8 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v10, v21, v1 ; SDAG-NEXT: v_or_b32_e32 v9, v9, v2 ; SDAG-NEXT: v_or_b32_e32 v11, v20, v0 -; SDAG-NEXT: .LBB1_12: ; %Flow12 -; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: .LBB1_12: ; %udiv-end ; SDAG-NEXT: v_mov_b32_e32 v0, v19 ; SDAG-NEXT: v_mov_b32_e32 v1, v18 ; SDAG-NEXT: v_mov_b32_e32 v2, v17 @@ -1192,6 +1224,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v16, v2 ; GISEL-NEXT: v_mov_b32_e32 v17, v3 +; GISEL-NEXT: s_mov_b64 s[12:13], exec ; GISEL-NEXT: s_mov_b64 s[8:9], 0 ; GISEL-NEXT: v_or_b32_e32 v2, v8, v10 ; GISEL-NEXT: v_or_b32_e32 v3, v9, v11 @@ -1245,14 +1278,16 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_or_b32_e32 v2, v3, v2 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v18, v0, 0, vcc -; GISEL-NEXT: v_and_b32_e32 v24, 1, v2 +; GISEL-NEXT: v_and_b32_e32 v3, 1, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v19, v1, 0, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v2, v16, 0, vcc +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v3 +; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GISEL-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GISEL-NEXT: v_cndmask_b32_e64 v3, v17, 0, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] -; GISEL-NEXT: s_cbranch_execz .LBB1_6 +; GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GISEL-NEXT: s_cbranch_scc0 .LBB1_6 ; GISEL-NEXT: ; %bb.1: ; %udiv-bb15 ; GISEL-NEXT: v_add_i32_e32 v26, vcc, 1, v20 ; GISEL-NEXT: v_addc_u32_e64 v27, s[4:5], 0, v21, vcc @@ -1271,19 +1306,21 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cndmask_b32_e32 v23, 0, v3, vcc ; GISEL-NEXT: v_or_b32_e32 v2, v20, v18 ; GISEL-NEXT: v_or_b32_e32 v3, v21, v19 +; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GISEL-NEXT: v_cndmask_b32_e32 v2, v24, v2, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v25, v3, vcc +; GISEL-NEXT: s_xor_b64 s[14:15], s[4:5], exec ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc +; GISEL-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v17, vcc ; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] ; GISEL-NEXT: v_mov_b32_e32 v21, s11 ; GISEL-NEXT: v_mov_b32_e32 v20, s10 ; GISEL-NEXT: v_mov_b32_e32 v19, s9 ; GISEL-NEXT: v_mov_b32_e32 v18, s8 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] -; GISEL-NEXT: s_cbranch_execz .LBB1_5 +; GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GISEL-NEXT: s_cbranch_scc0 .LBB1_5 ; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4 ; GISEL-NEXT: v_subrev_i32_e32 v32, vcc, 64, v26 ; GISEL-NEXT: v_sub_i32_e32 v24, vcc, 64, v26 @@ -1343,27 +1380,29 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_and_b32_e32 v21, v0, v10 ; GISEL-NEXT: v_and_b32_e32 v35, v0, v11 ; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 +; GISEL-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GISEL-NEXT: v_sub_i32_e32 v24, vcc, v20, v18 ; GISEL-NEXT: v_subb_u32_e32 v25, vcc, v25, v19, vcc +; GISEL-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v16, v21, vcc ; GISEL-NEXT: v_subb_u32_e32 v17, vcc, v17, v35, vcc ; GISEL-NEXT: v_or_b32_e32 v2, v2, v34 ; GISEL-NEXT: v_mov_b32_e32 v19, v1 ; GISEL-NEXT: v_mov_b32_e32 v18, v0 -; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GISEL-NEXT: s_cbranch_execnz .LBB1_3 +; GISEL-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GISEL-NEXT: s_cbranch_scc1 .LBB1_3 ; GISEL-NEXT: ; %bb.4: ; %Flow13 -; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: s_or_b64 exec, exec, s[14:15] ; GISEL-NEXT: .LBB1_5: ; %Flow14 -; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] ; GISEL-NEXT: v_lshl_b64 v[0:1], v[22:23], 1 ; GISEL-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GISEL-NEXT: v_lshrrev_b32_e32 v8, 31, v23 ; GISEL-NEXT: v_or_b32_e32 v2, v2, v8 ; GISEL-NEXT: v_or_b32_e32 v18, v18, v0 ; GISEL-NEXT: v_or_b32_e32 v19, v19, v1 -; GISEL-NEXT: .LBB1_6: ; %Flow16 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: .LBB1_6: ; %udiv-end1 +; GISEL-NEXT: s_mov_b64 s[12:13], exec ; GISEL-NEXT: s_mov_b64 s[8:9], 0 ; GISEL-NEXT: v_or_b32_e32 v0, v12, v14 ; GISEL-NEXT: v_or_b32_e32 v1, v13, v15 @@ -1417,14 +1456,16 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_or_b32_e32 v8, v9, v8 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, v4, 0, vcc -; GISEL-NEXT: v_and_b32_e32 v20, 1, v8 +; GISEL-NEXT: v_and_b32_e32 v9, 1, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v11, v5, 0, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v8, v6, 0, vcc +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v9 +; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GISEL-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GISEL-NEXT: v_cndmask_b32_e64 v9, v7, 0, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] -; GISEL-NEXT: s_cbranch_execz .LBB1_12 +; GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GISEL-NEXT: s_cbranch_scc0 .LBB1_12 ; GISEL-NEXT: ; %bb.7: ; %udiv-bb1 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v0 ; GISEL-NEXT: v_addc_u32_e64 v11, s[4:5], 0, v1, vcc @@ -1443,19 +1484,21 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v1, vcc ; GISEL-NEXT: v_or_b32_e32 v0, v20, v16 ; GISEL-NEXT: v_or_b32_e32 v1, v21, v17 +; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GISEL-NEXT: v_cndmask_b32_e32 v0, v22, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, v23, v1, vcc +; GISEL-NEXT: s_xor_b64 s[14:15], s[4:5], exec ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GISEL-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc ; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] ; GISEL-NEXT: v_mov_b32_e32 v23, s11 ; GISEL-NEXT: v_mov_b32_e32 v22, s10 ; GISEL-NEXT: v_mov_b32_e32 v21, s9 ; GISEL-NEXT: v_mov_b32_e32 v20, s8 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] -; GISEL-NEXT: s_cbranch_execz .LBB1_11 +; GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GISEL-NEXT: s_cbranch_scc0 .LBB1_11 ; GISEL-NEXT: ; %bb.8: ; %udiv-preheader ; GISEL-NEXT: v_subrev_i32_e32 v28, vcc, 64, v8 ; GISEL-NEXT: v_sub_i32_e32 v22, vcc, 64, v8 @@ -1516,26 +1559,27 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_and_b32_e32 v30, v6, v13 ; GISEL-NEXT: v_and_b32_e32 v31, v6, v14 ; GISEL-NEXT: v_and_b32_e32 v32, v6, v15 +; GISEL-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GISEL-NEXT: v_mov_b32_e32 v21, v5 ; GISEL-NEXT: v_mov_b32_e32 v20, v4 ; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v22, v7 ; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v23, v30, vcc +; GISEL-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v16, v31, vcc ; GISEL-NEXT: v_subb_u32_e32 v17, vcc, v17, v32, vcc -; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GISEL-NEXT: s_cbranch_execnz .LBB1_9 +; GISEL-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GISEL-NEXT: s_cbranch_scc1 .LBB1_9 ; GISEL-NEXT: ; %bb.10: ; %Flow -; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: s_or_b64 exec, exec, s[14:15] ; GISEL-NEXT: .LBB1_11: ; %Flow11 -; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] ; GISEL-NEXT: v_lshl_b64 v[4:5], v[9:10], 1 ; GISEL-NEXT: v_lshl_b64 v[8:9], v[0:1], 1 ; GISEL-NEXT: v_lshrrev_b32_e32 v0, 31, v10 ; GISEL-NEXT: v_or_b32_e32 v8, v8, v0 ; GISEL-NEXT: v_or_b32_e32 v10, v20, v4 ; GISEL-NEXT: v_or_b32_e32 v11, v21, v5 -; GISEL-NEXT: .LBB1_12: ; %Flow12 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: .LBB1_12: ; %udiv-end ; GISEL-NEXT: v_mov_b32_e32 v0, v18 ; GISEL-NEXT: v_mov_b32_e32 v1, v19 ; GISEL-NEXT: v_mov_b32_e32 v4, v10 @@ -1552,10 +1596,11 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SDAG-NEXT: s_mov_b64 s[10:11], exec ; SDAG-NEXT: v_ashrrev_i32_e32 v28, 31, v3 ; SDAG-NEXT: v_ashrrev_i32_e32 v16, 31, v11 ; SDAG-NEXT: v_mov_b32_e32 v17, 0 -; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f +; SDAG-NEXT: s_mov_b64 s[12:13], 0x7f ; SDAG-NEXT: v_mov_b32_e32 v29, v28 ; SDAG-NEXT: v_xor_b32_e32 v18, v3, v28 ; SDAG-NEXT: v_xor_b32_e32 v19, v2, v28 @@ -1610,7 +1655,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v11, v20, vcc ; SDAG-NEXT: v_xor_b32_e32 v16, 0x7f, v10 ; SDAG-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v17, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[10:11] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[12:13], v[10:11] ; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] ; SDAG-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v17, vcc ; SDAG-NEXT: v_or_b32_e32 v16, v16, v18 @@ -1627,10 +1672,12 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; SDAG-NEXT: v_cndmask_b32_e64 v32, v0, 0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v27, v3, 0, s[4:5] +; SDAG-NEXT: s_and_b64 s[6:7], s[6:7], vcc +; SDAG-NEXT: s_and_b64 s[6:7], s[6:7], exec +; SDAG-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; SDAG-NEXT: v_cndmask_b32_e64 v33, v2, 0, s[4:5] -; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB2_6 +; SDAG-NEXT: s_cmov_b64 exec, s[6:7] +; SDAG-NEXT: s_cbranch_scc0 .LBB2_6 ; SDAG-NEXT: ; %bb.1: ; %udiv-bb15 ; SDAG-NEXT: v_add_i32_e32 v32, vcc, 1, v10 ; SDAG-NEXT: v_sub_i32_e64 v20, s[4:5], 63, v10 @@ -1650,26 +1697,27 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_lshr_b64 v[18:19], v[2:3], v25 ; SDAG-NEXT: v_or_b32_e32 v11, v11, v19 ; SDAG-NEXT: v_or_b32_e32 v10, v10, v18 +; SDAG-NEXT: s_xor_b64 s[6:7], vcc, exec ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24 ; SDAG-NEXT: v_cndmask_b32_e64 v11, v21, v11, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v10, v20, v10, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v23, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v22, s[4:5] +; SDAG-NEXT: s_and_b64 s[4:5], vcc, -1 ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24 ; SDAG-NEXT: v_cndmask_b32_e64 v11, v11, v1, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v10, v10, v0, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v18, 0 ; SDAG-NEXT: v_mov_b32_e32 v19, 0 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB2_5 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB2_5 ; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4 ; SDAG-NEXT: v_lshr_b64 v[16:17], v[2:3], v32 ; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 64, v32 ; SDAG-NEXT: v_subrev_i32_e32 v37, vcc, 64, v32 ; SDAG-NEXT: v_lshr_b64 v[24:25], v[0:1], v32 ; SDAG-NEXT: v_add_i32_e32 v36, vcc, -1, v31 -; SDAG-NEXT: s_mov_b64 s[10:11], 0 +; SDAG-NEXT: s_mov_b64 s[8:9], 0 ; SDAG-NEXT: v_mov_b32_e32 v22, 0 ; SDAG-NEXT: v_mov_b32_e32 v23, 0 ; SDAG-NEXT: v_mov_b32_e32 v18, 0 @@ -1726,16 +1774,17 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v49, v33, v35 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[48:49] ; SDAG-NEXT: v_or_b32_e32 v21, v23, v21 -; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; SDAG-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SDAG-NEXT: s_andn2_b64 s[4:5], exec, s[8:9] +; SDAG-NEXT: s_and_b64 s[12:13], s[4:5], -1 ; SDAG-NEXT: v_or_b32_e32 v20, v22, v20 ; SDAG-NEXT: v_mov_b32_e32 v23, v17 ; SDAG-NEXT: v_mov_b32_e32 v22, v16 -; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] -; SDAG-NEXT: s_cbranch_execnz .LBB2_3 +; SDAG-NEXT: s_cselect_b64 exec, s[4:5], s[8:9] +; SDAG-NEXT: s_cbranch_scc1 .LBB2_3 ; SDAG-NEXT: ; %bb.4: ; %Flow13 -; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: .LBB2_5: ; %Flow14 -; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] ; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v22, 31, v21 ; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 @@ -1744,12 +1793,13 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v27, v17, v21 ; SDAG-NEXT: v_or_b32_e32 v32, v18, v10 ; SDAG-NEXT: v_or_b32_e32 v33, v16, v20 -; SDAG-NEXT: .LBB2_6: ; %Flow16 -; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] +; SDAG-NEXT: .LBB2_6: ; %udiv-end1 +; SDAG-NEXT: s_mov_b64 s[10:11], exec ; SDAG-NEXT: v_ashrrev_i32_e32 v26, 31, v7 ; SDAG-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; SDAG-NEXT: v_mov_b32_e32 v17, 0 -; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f +; SDAG-NEXT: s_mov_b64 s[12:13], 0x7f ; SDAG-NEXT: v_mov_b32_e32 v34, v26 ; SDAG-NEXT: v_xor_b32_e32 v10, v7, v26 ; SDAG-NEXT: v_xor_b32_e32 v11, v6, v26 @@ -1804,7 +1854,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_subb_u32_e32 v13, vcc, v13, v19, vcc ; SDAG-NEXT: v_xor_b32_e32 v16, 0x7f, v12 ; SDAG-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v17, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[12:13] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[12:13], v[12:13] ; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] ; SDAG-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v17, vcc ; SDAG-NEXT: v_or_b32_e32 v16, v16, v14 @@ -1821,10 +1871,12 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; SDAG-NEXT: v_cndmask_b32_e64 v18, v4, 0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v17, v7, 0, s[4:5] +; SDAG-NEXT: s_and_b64 s[6:7], s[6:7], vcc +; SDAG-NEXT: s_and_b64 s[6:7], s[6:7], exec +; SDAG-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; SDAG-NEXT: v_cndmask_b32_e64 v16, v6, 0, s[4:5] -; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB2_12 +; SDAG-NEXT: s_cmov_b64 exec, s[6:7] +; SDAG-NEXT: s_cbranch_scc0 .LBB2_12 ; SDAG-NEXT: ; %bb.7: ; %udiv-bb1 ; SDAG-NEXT: v_add_i32_e32 v38, vcc, 1, v12 ; SDAG-NEXT: v_sub_i32_e64 v18, s[4:5], 63, v12 @@ -1844,26 +1896,27 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_lshr_b64 v[12:13], v[6:7], v12 ; SDAG-NEXT: v_or_b32_e32 v13, v21, v13 ; SDAG-NEXT: v_or_b32_e32 v12, v20, v12 +; SDAG-NEXT: s_xor_b64 s[6:7], vcc, exec ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v15 ; SDAG-NEXT: v_cndmask_b32_e64 v14, v19, v13, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, v12, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v23, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v22, s[4:5] +; SDAG-NEXT: s_and_b64 s[4:5], vcc, -1 ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 ; SDAG-NEXT: v_cndmask_b32_e64 v15, v14, v5, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v14, v18, v4, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v18, 0 ; SDAG-NEXT: v_mov_b32_e32 v19, 0 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB2_11 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB2_11 ; SDAG-NEXT: ; %bb.8: ; %udiv-preheader ; SDAG-NEXT: v_lshr_b64 v[16:17], v[6:7], v38 ; SDAG-NEXT: v_sub_i32_e32 v24, vcc, 64, v38 ; SDAG-NEXT: v_subrev_i32_e32 v51, vcc, 64, v38 ; SDAG-NEXT: v_lshr_b64 v[22:23], v[4:5], v38 ; SDAG-NEXT: v_add_i32_e32 v50, vcc, -1, v37 -; SDAG-NEXT: s_mov_b64 s[10:11], 0 +; SDAG-NEXT: s_mov_b64 s[8:9], 0 ; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: v_mov_b32_e32 v21, 0 ; SDAG-NEXT: v_mov_b32_e32 v18, 0 @@ -1920,16 +1973,17 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v55, v39, v49 ; SDAG-NEXT: v_or_b32_e32 v54, v38, v48 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[54:55] -; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; SDAG-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SDAG-NEXT: s_andn2_b64 s[4:5], exec, s[8:9] +; SDAG-NEXT: s_and_b64 s[12:13], s[4:5], -1 ; SDAG-NEXT: v_or_b32_e32 v12, v20, v12 ; SDAG-NEXT: v_mov_b32_e32 v21, v17 ; SDAG-NEXT: v_mov_b32_e32 v20, v16 -; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] -; SDAG-NEXT: s_cbranch_execnz .LBB2_9 +; SDAG-NEXT: s_cselect_b64 exec, s[4:5], s[8:9] +; SDAG-NEXT: s_cbranch_scc1 .LBB2_9 ; SDAG-NEXT: ; %bb.10: ; %Flow -; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: .LBB2_11: ; %Flow11 -; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] ; SDAG-NEXT: v_lshl_b64 v[14:15], v[14:15], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v20, 31, v13 ; SDAG-NEXT: v_lshl_b64 v[12:13], v[12:13], 1 @@ -1938,8 +1992,8 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v17, v17, v13 ; SDAG-NEXT: v_or_b32_e32 v18, v18, v14 ; SDAG-NEXT: v_or_b32_e32 v16, v16, v12 -; SDAG-NEXT: .LBB2_12: ; %Flow12 -; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] +; SDAG-NEXT: .LBB2_12: ; %udiv-end ; SDAG-NEXT: v_mul_lo_u32 v14, v33, v9 ; SDAG-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v33, v8, 0 ; SDAG-NEXT: v_mul_lo_u32 v24, v27, v8 @@ -2017,6 +2071,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-LABEL: v_srem_v2i128_vv: ; GISEL: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b64 s[12:13], exec ; GISEL-NEXT: s_mov_b64 s[8:9], 0 ; GISEL-NEXT: v_ashrrev_i32_e32 v28, 31, v3 ; GISEL-NEXT: v_ashrrev_i32_e32 v20, 31, v11 @@ -2088,14 +2143,16 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_or_b32_e32 v18, v19, v18 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v31, v16, 0, vcc -; GISEL-NEXT: v_and_b32_e32 v20, 1, v18 +; GISEL-NEXT: v_and_b32_e32 v19, 1, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v32, v17, 0, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v18, v8, 0, vcc +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v19 +; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GISEL-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GISEL-NEXT: v_cndmask_b32_e64 v19, v9, 0, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] -; GISEL-NEXT: s_cbranch_execz .LBB2_6 +; GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GISEL-NEXT: s_cbranch_scc0 .LBB2_6 ; GISEL-NEXT: ; %bb.1: ; %udiv-bb15 ; GISEL-NEXT: v_add_i32_e32 v31, vcc, 1, v0 ; GISEL-NEXT: v_addc_u32_e64 v32, s[4:5], 0, v1, vcc @@ -2114,19 +2171,21 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cndmask_b32_e32 v21, 0, v1, vcc ; GISEL-NEXT: v_or_b32_e32 v0, v18, v2 ; GISEL-NEXT: v_or_b32_e32 v1, v19, v3 +; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GISEL-NEXT: v_cndmask_b32_e32 v0, v22, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, v23, v1, vcc +; GISEL-NEXT: s_xor_b64 s[14:15], s[4:5], exec ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 ; GISEL-NEXT: v_cndmask_b32_e32 v18, v0, v8, vcc +; GISEL-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GISEL-NEXT: v_cndmask_b32_e32 v19, v1, v9, vcc ; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] ; GISEL-NEXT: v_mov_b32_e32 v0, s8 ; GISEL-NEXT: v_mov_b32_e32 v1, s9 ; GISEL-NEXT: v_mov_b32_e32 v2, s10 ; GISEL-NEXT: v_mov_b32_e32 v3, s11 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] -; GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GISEL-NEXT: s_cbranch_scc0 .LBB2_5 ; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4 ; GISEL-NEXT: v_subrev_i32_e32 v24, vcc, 64, v31 ; GISEL-NEXT: v_sub_i32_e32 v22, vcc, 64, v31 @@ -2187,26 +2246,28 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_and_b32_e32 v25, v0, v29 ; GISEL-NEXT: v_and_b32_e32 v26, v0, v10 ; GISEL-NEXT: v_and_b32_e32 v0, v0, v11 +; GISEL-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GISEL-NEXT: v_sub_i32_e32 v24, vcc, v3, v1 ; GISEL-NEXT: v_subb_u32_e32 v25, vcc, v49, v25, vcc +; GISEL-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GISEL-NEXT: v_subb_u32_e32 v26, vcc, v2, v26, vcc ; GISEL-NEXT: v_subb_u32_e32 v27, vcc, v27, v0, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v22 ; GISEL-NEXT: v_mov_b32_e32 v1, v23 -; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GISEL-NEXT: s_cbranch_execnz .LBB2_3 +; GISEL-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GISEL-NEXT: s_cbranch_scc1 .LBB2_3 ; GISEL-NEXT: ; %bb.4: ; %Flow13 -; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: s_or_b64 exec, exec, s[14:15] ; GISEL-NEXT: .LBB2_5: ; %Flow14 -; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] ; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], 1 ; GISEL-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 ; GISEL-NEXT: v_lshrrev_b32_e32 v20, 31, v21 ; GISEL-NEXT: v_or_b32_e32 v18, v18, v20 ; GISEL-NEXT: v_or_b32_e32 v31, v0, v2 ; GISEL-NEXT: v_or_b32_e32 v32, v1, v3 -; GISEL-NEXT: .LBB2_6: ; %Flow16 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: .LBB2_6: ; %udiv-end1 +; GISEL-NEXT: s_mov_b64 s[12:13], exec ; GISEL-NEXT: s_mov_b64 s[8:9], 0 ; GISEL-NEXT: v_ashrrev_i32_e32 v33, 31, v7 ; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v15 @@ -2278,14 +2339,16 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_or_b32_e32 v2, v3, v2 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v20, v12, 0, vcc -; GISEL-NEXT: v_and_b32_e32 v22, 1, v2 +; GISEL-NEXT: v_and_b32_e32 v3, 1, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v21, v13, 0, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v3 +; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GISEL-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GISEL-NEXT: v_cndmask_b32_e64 v3, v7, 0, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] -; GISEL-NEXT: s_cbranch_execz .LBB2_12 +; GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GISEL-NEXT: s_cbranch_scc0 .LBB2_12 ; GISEL-NEXT: ; %bb.7: ; %udiv-bb1 ; GISEL-NEXT: v_add_i32_e32 v36, vcc, 1, v0 ; GISEL-NEXT: v_addc_u32_e64 v37, s[4:5], 0, v1, vcc @@ -2304,19 +2367,21 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cndmask_b32_e32 v21, 0, v1, vcc ; GISEL-NEXT: v_or_b32_e32 v0, v14, v2 ; GISEL-NEXT: v_or_b32_e32 v1, v15, v3 +; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GISEL-NEXT: v_cndmask_b32_e32 v0, v22, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, v23, v1, vcc +; GISEL-NEXT: s_xor_b64 s[14:15], s[4:5], exec ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 ; GISEL-NEXT: v_cndmask_b32_e32 v14, v0, v6, vcc +; GISEL-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GISEL-NEXT: v_cndmask_b32_e32 v15, v1, v7, vcc ; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] ; GISEL-NEXT: v_mov_b32_e32 v0, s8 ; GISEL-NEXT: v_mov_b32_e32 v1, s9 ; GISEL-NEXT: v_mov_b32_e32 v2, s10 ; GISEL-NEXT: v_mov_b32_e32 v3, s11 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] -; GISEL-NEXT: s_cbranch_execz .LBB2_11 +; GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GISEL-NEXT: s_cbranch_scc0 .LBB2_11 ; GISEL-NEXT: ; %bb.8: ; %udiv-preheader ; GISEL-NEXT: v_subrev_i32_e32 v24, vcc, 64, v36 ; GISEL-NEXT: v_sub_i32_e32 v22, vcc, 64, v36 @@ -2377,26 +2442,27 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_and_b32_e32 v25, v0, v34 ; GISEL-NEXT: v_and_b32_e32 v26, v0, v4 ; GISEL-NEXT: v_and_b32_e32 v52, v0, v5 +; GISEL-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GISEL-NEXT: v_sub_i32_e32 v24, vcc, v3, v1 ; GISEL-NEXT: v_subb_u32_e32 v25, vcc, v53, v25, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v22 ; GISEL-NEXT: v_mov_b32_e32 v1, v23 +; GISEL-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GISEL-NEXT: v_subb_u32_e32 v26, vcc, v2, v26, vcc ; GISEL-NEXT: v_subb_u32_e32 v27, vcc, v27, v52, vcc -; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GISEL-NEXT: s_cbranch_execnz .LBB2_9 +; GISEL-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GISEL-NEXT: s_cbranch_scc1 .LBB2_9 ; GISEL-NEXT: ; %bb.10: ; %Flow -; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: s_or_b64 exec, exec, s[14:15] ; GISEL-NEXT: .LBB2_11: ; %Flow11 -; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] ; GISEL-NEXT: v_lshl_b64 v[22:23], v[20:21], 1 ; GISEL-NEXT: v_lshl_b64 v[2:3], v[14:15], 1 ; GISEL-NEXT: v_lshrrev_b32_e32 v14, 31, v21 ; GISEL-NEXT: v_or_b32_e32 v2, v2, v14 ; GISEL-NEXT: v_or_b32_e32 v20, v0, v22 ; GISEL-NEXT: v_or_b32_e32 v21, v1, v23 -; GISEL-NEXT: .LBB2_12: ; %Flow12 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: .LBB2_12: ; %udiv-end ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v30, v31, 0 ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v30, v18, 0 ; GISEL-NEXT: v_mul_lo_u32 v24, v30, v19 @@ -2456,6 +2522,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-LABEL: v_urem_v2i128_vv: ; SDAG: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_mov_b64 s[8:9], exec ; SDAG-NEXT: v_or_b32_e32 v17, v9, v11 ; SDAG-NEXT: v_or_b32_e32 v16, v8, v10 ; SDAG-NEXT: v_or_b32_e32 v19, v1, v3 @@ -2469,7 +2536,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_ffbh_u32_e32 v26, v0 ; SDAG-NEXT: v_ffbh_u32_e32 v27, v1 ; SDAG-NEXT: v_mov_b32_e32 v28, 0 -; SDAG-NEXT: s_mov_b64 s[8:9], 0x7f +; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] ; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19] ; SDAG-NEXT: v_add_i32_e64 v16, s[6:7], 32, v20 @@ -2495,7 +2562,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_subb_u32_e32 v17, vcc, v20, v17, vcc ; SDAG-NEXT: v_xor_b32_e32 v18, 0x7f, v16 ; SDAG-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v28, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[16:17] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[16:17] ; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] ; SDAG-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v28, vcc ; SDAG-NEXT: v_or_b32_e32 v18, v18, v20 @@ -2512,10 +2579,12 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; SDAG-NEXT: v_cndmask_b32_e64 v31, v2, 0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v30, v1, 0, s[4:5] +; SDAG-NEXT: s_and_b64 s[6:7], s[6:7], vcc +; SDAG-NEXT: s_and_b64 s[6:7], s[6:7], exec +; SDAG-NEXT: s_and_b64 s[10:11], s[6:7], -1 ; SDAG-NEXT: v_cndmask_b32_e64 v32, v0, 0, s[4:5] -; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB3_6 +; SDAG-NEXT: s_cmov_b64 exec, s[6:7] +; SDAG-NEXT: s_cbranch_scc0 .LBB3_6 ; SDAG-NEXT: ; %bb.1: ; %udiv-bb15 ; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v16 ; SDAG-NEXT: v_sub_i32_e64 v22, s[4:5], 63, v16 @@ -2535,19 +2604,20 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_lshr_b64 v[20:21], v[0:1], v27 ; SDAG-NEXT: v_or_b32_e32 v17, v17, v21 ; SDAG-NEXT: v_or_b32_e32 v16, v16, v20 +; SDAG-NEXT: s_xor_b64 s[6:7], vcc, exec ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v26 ; SDAG-NEXT: v_cndmask_b32_e64 v17, v23, v17, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v16, v22, v16, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, v25, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, v24, s[4:5] +; SDAG-NEXT: s_and_b64 s[4:5], vcc, -1 ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v26 ; SDAG-NEXT: v_cndmask_b32_e64 v17, v17, v3, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v16, v16, v2, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: v_mov_b32_e32 v21, 0 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB3_5 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB3_5 ; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4 ; SDAG-NEXT: v_lshr_b64 v[18:19], v[0:1], v30 ; SDAG-NEXT: v_sub_i32_e32 v28, vcc, 64, v30 @@ -2612,15 +2682,16 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[38:39] ; SDAG-NEXT: v_or_b32_e32 v23, v25, v23 ; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; SDAG-NEXT: s_andn2_b64 s[4:5], exec, s[10:11] +; SDAG-NEXT: s_and_b64 s[12:13], s[4:5], -1 ; SDAG-NEXT: v_or_b32_e32 v22, v24, v22 ; SDAG-NEXT: v_mov_b32_e32 v25, v19 ; SDAG-NEXT: v_mov_b32_e32 v24, v18 -; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] -; SDAG-NEXT: s_cbranch_execnz .LBB3_3 +; SDAG-NEXT: s_cselect_b64 exec, s[4:5], s[10:11] +; SDAG-NEXT: s_cbranch_scc1 .LBB3_3 ; SDAG-NEXT: ; %bb.4: ; %Flow13 -; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: .LBB3_5: ; %Flow14 -; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] ; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v24, 31, v23 ; SDAG-NEXT: v_lshl_b64 v[22:23], v[22:23], 1 @@ -2629,8 +2700,9 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v30, v19, v23 ; SDAG-NEXT: v_or_b32_e32 v31, v20, v16 ; SDAG-NEXT: v_or_b32_e32 v32, v18, v22 -; SDAG-NEXT: .LBB3_6: ; %Flow16 -; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: .LBB3_6: ; %udiv-end1 +; SDAG-NEXT: s_mov_b64 s[8:9], exec ; SDAG-NEXT: v_or_b32_e32 v17, v13, v15 ; SDAG-NEXT: v_or_b32_e32 v16, v12, v14 ; SDAG-NEXT: v_or_b32_e32 v19, v5, v7 @@ -2644,7 +2716,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_ffbh_u32_e32 v26, v4 ; SDAG-NEXT: v_ffbh_u32_e32 v27, v5 ; SDAG-NEXT: v_mov_b32_e32 v28, 0 -; SDAG-NEXT: s_mov_b64 s[8:9], 0x7f +; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] ; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19] ; SDAG-NEXT: v_add_i32_e64 v16, s[6:7], 32, v20 @@ -2670,7 +2742,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_subb_u32_e32 v17, vcc, v20, v17, vcc ; SDAG-NEXT: v_xor_b32_e32 v20, 0x7f, v16 ; SDAG-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v28, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[16:17] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[16:17] ; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] ; SDAG-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v28, vcc ; SDAG-NEXT: v_or_b32_e32 v20, v20, v18 @@ -2687,10 +2759,12 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; SDAG-NEXT: v_cndmask_b32_e64 v22, v6, 0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v21, v5, 0, s[4:5] +; SDAG-NEXT: s_and_b64 s[6:7], s[6:7], vcc +; SDAG-NEXT: s_and_b64 s[6:7], s[6:7], exec +; SDAG-NEXT: s_and_b64 s[10:11], s[6:7], -1 ; SDAG-NEXT: v_cndmask_b32_e64 v20, v4, 0, s[4:5] -; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB3_12 +; SDAG-NEXT: s_cmov_b64 exec, s[6:7] +; SDAG-NEXT: s_cbranch_scc0 .LBB3_12 ; SDAG-NEXT: ; %bb.7: ; %udiv-bb1 ; SDAG-NEXT: v_add_i32_e32 v34, vcc, 1, v16 ; SDAG-NEXT: v_sub_i32_e64 v22, s[4:5], 63, v16 @@ -2710,19 +2784,20 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_lshr_b64 v[16:17], v[4:5], v16 ; SDAG-NEXT: v_or_b32_e32 v17, v25, v17 ; SDAG-NEXT: v_or_b32_e32 v16, v24, v16 +; SDAG-NEXT: s_xor_b64 s[6:7], vcc, exec ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v19 ; SDAG-NEXT: v_cndmask_b32_e64 v18, v23, v17, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v22, v22, v16, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v17, 0, v27, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v16, 0, v26, s[4:5] +; SDAG-NEXT: s_and_b64 s[4:5], vcc, -1 ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 ; SDAG-NEXT: v_cndmask_b32_e64 v19, v18, v7, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v18, v22, v6, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v22, 0 ; SDAG-NEXT: v_mov_b32_e32 v23, 0 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB3_11 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB3_11 ; SDAG-NEXT: ; %bb.8: ; %udiv-preheader ; SDAG-NEXT: v_lshr_b64 v[20:21], v[4:5], v34 ; SDAG-NEXT: v_sub_i32_e32 v28, vcc, 64, v34 @@ -2787,15 +2862,16 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v50, v34, v36 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[50:51] ; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; SDAG-NEXT: s_andn2_b64 s[4:5], exec, s[10:11] +; SDAG-NEXT: s_and_b64 s[12:13], s[4:5], -1 ; SDAG-NEXT: v_or_b32_e32 v16, v24, v16 ; SDAG-NEXT: v_mov_b32_e32 v25, v21 ; SDAG-NEXT: v_mov_b32_e32 v24, v20 -; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] -; SDAG-NEXT: s_cbranch_execnz .LBB3_9 +; SDAG-NEXT: s_cselect_b64 exec, s[4:5], s[10:11] +; SDAG-NEXT: s_cbranch_scc1 .LBB3_9 ; SDAG-NEXT: ; %bb.10: ; %Flow -; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: .LBB3_11: ; %Flow11 -; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] ; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v24, 31, v17 ; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 @@ -2804,8 +2880,8 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v21, v21, v17 ; SDAG-NEXT: v_or_b32_e32 v22, v22, v18 ; SDAG-NEXT: v_or_b32_e32 v20, v20, v16 -; SDAG-NEXT: .LBB3_12: ; %Flow12 -; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: .LBB3_12: ; %udiv-end ; SDAG-NEXT: v_mul_lo_u32 v18, v32, v11 ; SDAG-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v32, v10, 0 ; SDAG-NEXT: v_mul_lo_u32 v28, v30, v10 @@ -2866,6 +2942,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-LABEL: v_urem_v2i128_vv: ; GISEL: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b64 s[12:13], exec ; GISEL-NEXT: s_mov_b64 s[8:9], 0 ; GISEL-NEXT: v_or_b32_e32 v16, v8, v10 ; GISEL-NEXT: v_or_b32_e32 v17, v9, v11 @@ -2919,14 +2996,16 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_or_b32_e32 v20, v21, v20 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 ; GISEL-NEXT: v_cndmask_b32_e64 v32, v0, 0, vcc -; GISEL-NEXT: v_and_b32_e32 v22, 1, v20 +; GISEL-NEXT: v_and_b32_e32 v21, 1, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v33, v1, 0, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v20, v2, 0, vcc +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v21 +; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GISEL-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GISEL-NEXT: v_cndmask_b32_e64 v21, v3, 0, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] -; GISEL-NEXT: s_cbranch_execz .LBB3_6 +; GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GISEL-NEXT: s_cbranch_scc0 .LBB3_6 ; GISEL-NEXT: ; %bb.1: ; %udiv-bb15 ; GISEL-NEXT: v_add_i32_e32 v30, vcc, 1, v16 ; GISEL-NEXT: v_addc_u32_e64 v31, s[4:5], 0, v17, vcc @@ -2945,19 +3024,21 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cndmask_b32_e32 v23, 0, v17, vcc ; GISEL-NEXT: v_or_b32_e32 v16, v20, v18 ; GISEL-NEXT: v_or_b32_e32 v17, v21, v19 +; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GISEL-NEXT: v_cndmask_b32_e32 v16, v24, v16, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v17, v25, v17, vcc +; GISEL-NEXT: s_xor_b64 s[14:15], s[4:5], exec ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26 ; GISEL-NEXT: v_cndmask_b32_e32 v20, v16, v2, vcc +; GISEL-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GISEL-NEXT: v_cndmask_b32_e32 v21, v17, v3, vcc ; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] ; GISEL-NEXT: v_mov_b32_e32 v19, s11 ; GISEL-NEXT: v_mov_b32_e32 v18, s10 ; GISEL-NEXT: v_mov_b32_e32 v17, s9 ; GISEL-NEXT: v_mov_b32_e32 v16, s8 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] -; GISEL-NEXT: s_cbranch_execz .LBB3_5 +; GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GISEL-NEXT: s_cbranch_scc0 .LBB3_5 ; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4 ; GISEL-NEXT: v_subrev_i32_e32 v26, vcc, 64, v30 ; GISEL-NEXT: v_sub_i32_e32 v24, vcc, 64, v30 @@ -3018,26 +3099,28 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_and_b32_e32 v27, v16, v9 ; GISEL-NEXT: v_and_b32_e32 v28, v16, v10 ; GISEL-NEXT: v_and_b32_e32 v16, v16, v11 +; GISEL-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GISEL-NEXT: v_sub_i32_e32 v26, vcc, v19, v17 ; GISEL-NEXT: v_subb_u32_e32 v27, vcc, v39, v27, vcc +; GISEL-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GISEL-NEXT: v_subb_u32_e32 v28, vcc, v18, v28, vcc ; GISEL-NEXT: v_subb_u32_e32 v29, vcc, v29, v16, vcc ; GISEL-NEXT: v_mov_b32_e32 v16, v24 ; GISEL-NEXT: v_mov_b32_e32 v17, v25 -; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GISEL-NEXT: s_cbranch_execnz .LBB3_3 +; GISEL-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GISEL-NEXT: s_cbranch_scc1 .LBB3_3 ; GISEL-NEXT: ; %bb.4: ; %Flow13 -; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: s_or_b64 exec, exec, s[14:15] ; GISEL-NEXT: .LBB3_5: ; %Flow14 -; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] ; GISEL-NEXT: v_lshl_b64 v[18:19], v[22:23], 1 ; GISEL-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 ; GISEL-NEXT: v_lshrrev_b32_e32 v22, 31, v23 ; GISEL-NEXT: v_or_b32_e32 v20, v20, v22 ; GISEL-NEXT: v_or_b32_e32 v32, v16, v18 ; GISEL-NEXT: v_or_b32_e32 v33, v17, v19 -; GISEL-NEXT: .LBB3_6: ; %Flow16 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: .LBB3_6: ; %udiv-end1 +; GISEL-NEXT: s_mov_b64 s[12:13], exec ; GISEL-NEXT: s_mov_b64 s[8:9], 0 ; GISEL-NEXT: v_or_b32_e32 v16, v12, v14 ; GISEL-NEXT: v_or_b32_e32 v17, v13, v15 @@ -3091,14 +3174,16 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_or_b32_e32 v18, v19, v18 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 ; GISEL-NEXT: v_cndmask_b32_e64 v24, v4, 0, vcc -; GISEL-NEXT: v_and_b32_e32 v26, 1, v18 +; GISEL-NEXT: v_and_b32_e32 v19, 1, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v25, v5, 0, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v18, v6, 0, vcc +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v19 +; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GISEL-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GISEL-NEXT: v_cndmask_b32_e64 v19, v7, 0, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] -; GISEL-NEXT: s_cbranch_execz .LBB3_12 +; GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GISEL-NEXT: s_cbranch_scc0 .LBB3_12 ; GISEL-NEXT: ; %bb.7: ; %udiv-bb1 ; GISEL-NEXT: v_add_i32_e32 v34, vcc, 1, v16 ; GISEL-NEXT: v_addc_u32_e64 v35, s[4:5], 0, v17, vcc @@ -3117,19 +3202,21 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cndmask_b32_e32 v25, 0, v17, vcc ; GISEL-NEXT: v_or_b32_e32 v16, v22, v18 ; GISEL-NEXT: v_or_b32_e32 v17, v23, v19 +; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GISEL-NEXT: v_cndmask_b32_e32 v16, v26, v16, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v17, v27, v17, vcc +; GISEL-NEXT: s_xor_b64 s[14:15], s[4:5], exec ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v28 ; GISEL-NEXT: v_cndmask_b32_e32 v22, v16, v6, vcc +; GISEL-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GISEL-NEXT: v_cndmask_b32_e32 v23, v17, v7, vcc ; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] ; GISEL-NEXT: v_mov_b32_e32 v19, s11 ; GISEL-NEXT: v_mov_b32_e32 v18, s10 ; GISEL-NEXT: v_mov_b32_e32 v17, s9 ; GISEL-NEXT: v_mov_b32_e32 v16, s8 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] -; GISEL-NEXT: s_cbranch_execz .LBB3_11 +; GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GISEL-NEXT: s_cbranch_scc0 .LBB3_11 ; GISEL-NEXT: ; %bb.8: ; %udiv-preheader ; GISEL-NEXT: v_subrev_i32_e32 v28, vcc, 64, v34 ; GISEL-NEXT: v_sub_i32_e32 v26, vcc, 64, v34 @@ -3190,26 +3277,27 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_and_b32_e32 v29, v16, v13 ; GISEL-NEXT: v_and_b32_e32 v30, v16, v14 ; GISEL-NEXT: v_and_b32_e32 v50, v16, v15 +; GISEL-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] ; GISEL-NEXT: v_sub_i32_e32 v28, vcc, v19, v17 ; GISEL-NEXT: v_subb_u32_e32 v29, vcc, v51, v29, vcc ; GISEL-NEXT: v_mov_b32_e32 v16, v26 ; GISEL-NEXT: v_mov_b32_e32 v17, v27 +; GISEL-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GISEL-NEXT: v_subb_u32_e32 v30, vcc, v18, v30, vcc ; GISEL-NEXT: v_subb_u32_e32 v31, vcc, v31, v50, vcc -; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GISEL-NEXT: s_cbranch_execnz .LBB3_9 +; GISEL-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GISEL-NEXT: s_cbranch_scc1 .LBB3_9 ; GISEL-NEXT: ; %bb.10: ; %Flow -; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: s_or_b64 exec, exec, s[14:15] ; GISEL-NEXT: .LBB3_11: ; %Flow11 -; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] ; GISEL-NEXT: v_lshl_b64 v[26:27], v[24:25], 1 ; GISEL-NEXT: v_lshl_b64 v[18:19], v[22:23], 1 ; GISEL-NEXT: v_lshrrev_b32_e32 v22, 31, v25 ; GISEL-NEXT: v_or_b32_e32 v18, v18, v22 ; GISEL-NEXT: v_or_b32_e32 v24, v16, v26 ; GISEL-NEXT: v_or_b32_e32 v25, v17, v27 -; GISEL-NEXT: .LBB3_12: ; %Flow12 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: .LBB3_12: ; %udiv-end ; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v32, 0 ; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v8, v20, 0 ; GISEL-NEXT: v_mul_lo_u32 v28, v8, v21 diff --git a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll index 757458363284c..0f02296104168 100644 --- a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll +++ b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; NOTE: The checks for opt are NOT added by the update script. Those @@ -29,7 +30,6 @@ define amdgpu_ps void @main(i32 %0, float %1) { ; ISA-NEXT: s_branch .LBB0_3 ; ISA-NEXT: .LBB0_1: ; %Flow1 ; ISA-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; ISA-NEXT: s_or_b64 exec, exec, s[6:7] ; ISA-NEXT: s_mov_b64 s[6:7], 0 ; ISA-NEXT: .LBB0_2: ; %Flow ; ISA-NEXT: ; in Loop: Header=BB0_3 Depth=1 @@ -38,8 +38,10 @@ define amdgpu_ps void @main(i32 %0, float %1) { ; ISA-NEXT: s_andn2_b64 s[2:3], s[2:3], exec ; ISA-NEXT: s_and_b64 s[6:7], s[6:7], exec ; ISA-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] -; ISA-NEXT: s_andn2_b64 exec, exec, s[0:1] -; ISA-NEXT: s_cbranch_execz .LBB0_6 +; ISA-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; ISA-NEXT: s_and_b64 s[10:11], s[6:7], -1 +; ISA-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; ISA-NEXT: s_cbranch_scc0 .LBB0_6 ; ISA-NEXT: .LBB0_3: ; %loop ; ISA-NEXT: ; =>This Inner Loop Header: Depth=1 ; ISA-NEXT: s_or_b64 s[4:5], s[4:5], exec @@ -48,22 +50,29 @@ define amdgpu_ps void @main(i32 %0, float %1) { ; ISA-NEXT: s_cbranch_scc0 .LBB0_2 ; ISA-NEXT: ; %bb.4: ; %endif1 ; ISA-NEXT: ; in Loop: Header=BB0_3 Depth=1 +; ISA-NEXT: s_and_b64 s[10:11], vcc, exec +; ISA-NEXT: s_and_b64 s[4:5], s[10:11], -1 +; ISA-NEXT: s_mov_b64 s[6:7], exec ; ISA-NEXT: s_mov_b64 s[4:5], -1 -; ISA-NEXT: s_and_saveexec_b64 s[6:7], vcc -; ISA-NEXT: s_cbranch_execz .LBB0_1 +; ISA-NEXT: s_cmov_b64 exec, s[10:11] +; ISA-NEXT: s_cbranch_scc0 .LBB0_1 ; ISA-NEXT: ; %bb.5: ; %endif2 ; ISA-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; ISA-NEXT: s_add_i32 s8, s8, 1 ; ISA-NEXT: s_xor_b64 s[4:5], exec, -1 +; ISA-NEXT: s_or_b64 exec, exec, s[6:7] ; ISA-NEXT: s_branch .LBB0_1 ; ISA-NEXT: .LBB0_6: ; %Flow2 -; ISA-NEXT: s_or_b64 exec, exec, s[0:1] +; ISA-NEXT: s_and_b64 s[2:3], s[2:3], exec +; ISA-NEXT: s_mov_b64 s[0:1], exec +; ISA-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; ISA-NEXT: v_mov_b32_e32 v1, 0 -; ISA-NEXT: s_and_saveexec_b64 s[0:1], s[2:3] +; ISA-NEXT: s_cmov_b64 exec, s[2:3] +; ISA-NEXT: s_cbranch_scc0 .LBB0_8 ; ISA-NEXT: ; %bb.7: ; %if1 ; ISA-NEXT: v_sqrt_f32_e32 v1, v0 -; ISA-NEXT: ; %bb.8: ; %endloop ; ISA-NEXT: s_or_b64 exec, exec, s[0:1] +; ISA-NEXT: .LBB0_8: ; %endloop ; ISA-NEXT: exp mrt0 v1, v1, v1, v1 done vm ; ISA-NEXT: s_endpgm start: diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine.mir b/llvm/test/CodeGen/AMDGPU/dpp_combine.mir index a1c3970a5bae9..9b8a1f7fa46f9 100644 --- a/llvm/test/CodeGen/AMDGPU/dpp_combine.mir +++ b/llvm/test/CodeGen/AMDGPU/dpp_combine.mir @@ -429,9 +429,9 @@ body: | bb.1: successors: %bb.2 + SI_WAVE_RECONVERGE %8, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.2: - SI_END_CF %8, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ... # GCN-LABEL: name: old_in_diff_bb diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir index 29621a0477418..05627809615ac 100644 --- a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir +++ b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir @@ -383,9 +383,9 @@ body: | bb.1: successors: %bb.2 + SI_WAVE_RECONVERGE %8, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.2: - SI_END_CF %8, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ... # GCN-LABEL: name: old_in_diff_bb diff --git a/llvm/test/CodeGen/AMDGPU/early-tailduplicator-terminator.mir b/llvm/test/CodeGen/AMDGPU/early-tailduplicator-terminator.mir index 41c6906b3c85a..a1cca8d61d0d0 100644 --- a/llvm/test/CodeGen/AMDGPU/early-tailduplicator-terminator.mir +++ b/llvm/test/CodeGen/AMDGPU/early-tailduplicator-terminator.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=early-tailduplication -verify-machineinstrs -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -run-pass=early-tailduplication -verify-machineinstrs -o - %s | FileCheck %s # Early tail duplication should not merge bb.6 into bb.5, adding a # non-terminator (S_SLEEP) after the terminator S_MOV_B32_term. @@ -22,19 +22,16 @@ body: | ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cond:sreg_32_xm0_xexec = IMPLICIT_DEF ; CHECK-NEXT: [[S_MOV_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32_term $exec_lo ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.3, implicit $exec + ; CHECK-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term %cond, $exec_lo, implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_MOV_B32_term]], %bb.3, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: - ; CHECK-NEXT: successors: %bb.5(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_term]] - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.5: ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_SLEEP 2 @@ -46,13 +43,14 @@ body: | S_SLEEP 1 bb.3: + %cond:sreg_32_xm0_xexec = IMPLICIT_DEF %0:sreg_32_xm0_xexec = S_MOV_B32_term $exec_lo bb.4: - SI_WATERFALL_LOOP %bb.4, implicit $exec + %1:sreg_32_xm0_xexec = S_XOR_B32_term %cond, $exec_lo, implicit def $scc + SI_WATERFALL_LOOP %1, %0, %bb.4, implicit $exec bb.5: - $exec_lo = S_MOV_B32_term %0 bb.6: S_SLEEP 2 diff --git a/llvm/test/CodeGen/AMDGPU/else.ll b/llvm/test/CodeGen/AMDGPU/else.ll index 655c5cd184a1e..5b1751a261063 100644 --- a/llvm/test/CodeGen/AMDGPU/else.ll +++ b/llvm/test/CodeGen/AMDGPU/else.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s diff --git a/llvm/test/CodeGen/AMDGPU/endcf-loop-header.ll b/llvm/test/CodeGen/AMDGPU/endcf-loop-header.ll index 00c5e0abf6506..406359fbda703 100644 --- a/llvm/test/CodeGen/AMDGPU/endcf-loop-header.ll +++ b/llvm/test/CodeGen/AMDGPU/endcf-loop-header.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; RUN: llc < %s -mtriple=amdgcn -verify-machineinstrs | FileCheck %s ; This tests that the llvm.SI.end.cf intrinsic is not inserted into the diff --git a/llvm/test/CodeGen/AMDGPU/fix-frame-ptr-reg-copy-livein.ll b/llvm/test/CodeGen/AMDGPU/fix-frame-ptr-reg-copy-livein.ll index 6ce3c68fce24e..de348b3118411 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-frame-ptr-reg-copy-livein.ll +++ b/llvm/test/CodeGen/AMDGPU/fix-frame-ptr-reg-copy-livein.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -stop-after=prologepilog -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; It is a small loop test that iterates over the array member of the structure argument passed byval to the function. diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.v2f16.ll b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.v2f16.ll index 376fe79f542e3..bc14b433f067b 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.v2f16.ll @@ -67,8 +67,6 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16(ptr %ptr, <2 x half> %val) { ; GFX940-NEXT: {{ $}} ; GFX940-NEXT: bb.2.atomicrmw.end: ; GFX940-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[FLAT_ATOMIC_CMPSWAP_RTN]], %bb.1 - ; GFX940-NEXT: [[PHI3:%[0-9]+]]:sreg_64 = PHI [[SI_IF_BREAK]], %bb.1 - ; GFX940-NEXT: SI_END_CF [[PHI3]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX940-NEXT: $vgpr0 = COPY [[PHI2]] ; GFX940-NEXT: SI_RETURN implicit $vgpr0 %result = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst @@ -105,8 +103,6 @@ define void @flat_agent_atomic_fadd_noret_v2f16(ptr %ptr, <2 x half> %val) { ; GFX940-NEXT: S_BRANCH %bb.2 ; GFX940-NEXT: {{ $}} ; GFX940-NEXT: bb.2.atomicrmw.end: - ; GFX940-NEXT: [[PHI2:%[0-9]+]]:sreg_64 = PHI [[SI_IF_BREAK]], %bb.1 - ; GFX940-NEXT: SI_END_CF [[PHI2]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX940-NEXT: SI_RETURN %result = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst ret void diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll index 5bd527149572e..555280894acf6 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll @@ -1773,11 +1773,12 @@ define void @flat_atomic_nand_i32_noret(ptr %ptr, i32 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN1-NEXT: v_mov_b32_e32 v4, v3 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB50_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB50_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i32_noret: @@ -1795,11 +1796,12 @@ define void @flat_atomic_nand_i32_noret(ptr %ptr, i32 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN2-NEXT: v_mov_b32_e32 v4, v3 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB50_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB50_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i32_noret: @@ -1817,11 +1819,12 @@ define void @flat_atomic_nand_i32_noret(ptr %ptr, i32 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN3-NEXT: v_mov_b32_e32 v4, v3 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB50_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB50_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw nand ptr %ptr, i32 %in seq_cst ret void @@ -1845,11 +1848,12 @@ define void @flat_atomic_nand_i32_noret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN1-NEXT: v_mov_b32_e32 v4, v3 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB51_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB51_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i32_noret_offset: @@ -1869,11 +1873,12 @@ define void @flat_atomic_nand_i32_noret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN2-NEXT: v_mov_b32_e32 v4, v3 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB51_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB51_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i32_noret_offset: @@ -1891,11 +1896,12 @@ define void @flat_atomic_nand_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN3-NEXT: v_mov_b32_e32 v4, v3 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB51_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB51_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw nand ptr %gep, i32 %in seq_cst @@ -1919,10 +1925,11 @@ define i32 @flat_atomic_nand_i32_ret(ptr %ptr, i32 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB52_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB52_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v3 ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -1942,10 +1949,11 @@ define i32 @flat_atomic_nand_i32_ret(ptr %ptr, i32 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB52_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB52_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v3 ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -1965,10 +1973,11 @@ define i32 @flat_atomic_nand_i32_ret(ptr %ptr, i32 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB52_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB52_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw nand ptr %ptr, i32 %in seq_cst @@ -1994,10 +2003,11 @@ define i32 @flat_atomic_nand_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB53_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB53_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i32_ret_offset: @@ -2018,10 +2028,11 @@ define i32 @flat_atomic_nand_i32_ret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB53_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB53_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i32_ret_offset: @@ -2040,10 +2051,11 @@ define i32 @flat_atomic_nand_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB53_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB53_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 @@ -2069,11 +2081,12 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB54_1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB54_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i32_noret_scalar: @@ -2093,11 +2106,12 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB54_1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB54_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i32_noret_scalar: @@ -2117,11 +2131,12 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB54_1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB54_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw nand ptr %ptr, i32 %in seq_cst ret void @@ -2147,11 +2162,12 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB55_1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB55_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i32_noret_offset_scalar: @@ -2173,11 +2189,12 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB55_1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB55_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i32_noret_offset_scalar: @@ -2197,11 +2214,12 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB55_1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB55_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw nand ptr %gep, i32 %in seq_cst @@ -2229,10 +2247,11 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB56_1 +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB56_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i32_ret_scalar: @@ -2255,10 +2274,11 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB56_1 +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB56_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i32_ret_scalar: @@ -2281,10 +2301,11 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB56_1 +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB56_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw nand ptr %ptr, i32 %in seq_cst ret i32 %result @@ -2311,10 +2332,11 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB57_1 +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB57_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i32_ret_offset_scalar: @@ -2337,10 +2359,11 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB57_1 +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB57_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i32_ret_offset_scalar: @@ -2363,10 +2386,11 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB57_1 +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB57_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw nand ptr %gep, i32 %in seq_cst @@ -2391,11 +2415,12 @@ define void @flat_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN1-NEXT: v_mov_b32_e32 v4, v3 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB58_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB58_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory: @@ -2415,11 +2440,12 @@ define void @flat_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN2-NEXT: v_mov_b32_e32 v4, v3 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB58_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB58_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory: @@ -2437,11 +2463,12 @@ define void @flat_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN3-NEXT: v_mov_b32_e32 v4, v3 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB58_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB58_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i64 4 %tmp0 = atomicrmw nand ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -2467,10 +2494,11 @@ define i32 @flat_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB59_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB59_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory: @@ -2491,10 +2519,11 @@ define i32 @flat_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB59_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB59_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory: @@ -2513,10 +2542,11 @@ define i32 @flat_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB59_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB59_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i64 4 @@ -3243,11 +3273,12 @@ define void @flat_atomic_max_i32_noret(ptr %ptr, i32 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN1-NEXT: v_mov_b32_e32 v4, v3 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB80_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB80_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i32_noret: @@ -3264,11 +3295,12 @@ define void @flat_atomic_max_i32_noret(ptr %ptr, i32 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN2-NEXT: v_mov_b32_e32 v4, v3 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB80_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB80_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i32_noret: @@ -3285,11 +3317,12 @@ define void @flat_atomic_max_i32_noret(ptr %ptr, i32 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN3-NEXT: v_mov_b32_e32 v4, v3 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB80_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB80_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw max ptr %ptr, i32 %in seq_cst ret void @@ -3312,11 +3345,12 @@ define void @flat_atomic_max_i32_noret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN1-NEXT: v_mov_b32_e32 v4, v3 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB81_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB81_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i32_noret_offset: @@ -3335,11 +3369,12 @@ define void @flat_atomic_max_i32_noret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN2-NEXT: v_mov_b32_e32 v4, v3 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB81_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB81_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i32_noret_offset: @@ -3356,11 +3391,12 @@ define void @flat_atomic_max_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN3-NEXT: v_mov_b32_e32 v4, v3 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB81_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB81_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw max ptr %gep, i32 %in seq_cst @@ -3383,10 +3419,11 @@ define i32 @flat_atomic_max_i32_ret(ptr %ptr, i32 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB82_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB82_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v3 ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -3405,10 +3442,11 @@ define i32 @flat_atomic_max_i32_ret(ptr %ptr, i32 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB82_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB82_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v3 ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -3427,10 +3465,11 @@ define i32 @flat_atomic_max_i32_ret(ptr %ptr, i32 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB82_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB82_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw max ptr %ptr, i32 %in seq_cst @@ -3455,10 +3494,11 @@ define i32 @flat_atomic_max_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB83_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB83_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i32_ret_offset: @@ -3478,10 +3518,11 @@ define i32 @flat_atomic_max_i32_ret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB83_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB83_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i32_ret_offset: @@ -3499,10 +3540,11 @@ define i32 @flat_atomic_max_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB83_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB83_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 @@ -3527,11 +3569,12 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB84_1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB84_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i32_noret_scalar: @@ -3550,11 +3593,12 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB84_1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB84_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i32_noret_scalar: @@ -3573,11 +3617,12 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB84_1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB84_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw max ptr %ptr, i32 %in seq_cst ret void @@ -3602,11 +3647,12 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB85_1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB85_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i32_noret_offset_scalar: @@ -3627,11 +3673,12 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB85_1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB85_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i32_noret_offset_scalar: @@ -3650,11 +3697,12 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB85_1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB85_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw max ptr %gep, i32 %in seq_cst @@ -3681,10 +3729,11 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB86_1 +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB86_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i32_ret_scalar: @@ -3706,10 +3755,11 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB86_1 +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB86_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i32_ret_scalar: @@ -3731,10 +3781,11 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB86_1 +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB86_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw max ptr %ptr, i32 %in seq_cst ret i32 %result @@ -3760,10 +3811,11 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB87_1 +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB87_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i32_ret_offset_scalar: @@ -3785,10 +3837,11 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB87_1 +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB87_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i32_ret_offset_scalar: @@ -3810,10 +3863,11 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB87_1 +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB87_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw max ptr %gep, i32 %in seq_cst @@ -3845,9 +3899,11 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN1-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GCN1-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB88_1 +; GCN1-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GCN1-NEXT: s_cbranch_scc1 .LBB88_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; @@ -3875,9 +3931,11 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN2-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GCN2-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB88_1 +; GCN2-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GCN2-NEXT: s_cbranch_scc1 .LBB88_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -3903,9 +3961,11 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GCN3-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB88_1 +; GCN3-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GCN3-NEXT: s_cbranch_scc1 .LBB88_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: @@ -3942,10 +4002,11 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB89_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN1-NEXT: s_cbranch_scc1 .LBB89_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dword v[0:1], v2 @@ -3977,10 +4038,11 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB89_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN2-NEXT: s_cbranch_scc1 .LBB89_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 @@ -4010,10 +4072,11 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB89_1 +; GCN3-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GCN3-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GCN3-NEXT: s_cbranch_scc1 .LBB89_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dword v[0:1], v2 @@ -4049,9 +4112,11 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN1-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GCN1-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB90_1 +; GCN1-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GCN1-NEXT: s_cbranch_scc1 .LBB90_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; @@ -4077,9 +4142,11 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN2-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GCN2-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB90_1 +; GCN2-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GCN2-NEXT: s_cbranch_scc1 .LBB90_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -4105,9 +4172,11 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GCN3-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB90_1 +; GCN3-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GCN3-NEXT: s_cbranch_scc1 .LBB90_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: @@ -4141,10 +4210,11 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB91_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN1-NEXT: s_cbranch_scc1 .LBB91_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dword v[0:1], v2 @@ -4174,10 +4244,11 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB91_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN2-NEXT: s_cbranch_scc1 .LBB91_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 @@ -4207,10 +4278,11 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB91_1 +; GCN3-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GCN3-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GCN3-NEXT: s_cbranch_scc1 .LBB91_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dword v[0:1], v2 @@ -4239,11 +4311,12 @@ define void @flat_max_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %i ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN1-NEXT: v_mov_b32_e32 v4, v3 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB92_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB92_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_max_i32_noret_offset__amdgpu_no_remote_memory: @@ -4262,11 +4335,12 @@ define void @flat_max_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %i ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN2-NEXT: v_mov_b32_e32 v4, v3 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB92_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB92_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_max_i32_noret_offset__amdgpu_no_remote_memory: @@ -4283,11 +4357,12 @@ define void @flat_max_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %i ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN3-NEXT: v_mov_b32_e32 v4, v3 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB92_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB92_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i64 4 %tmp0 = atomicrmw max ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -4312,10 +4387,11 @@ define i32 @flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i3 ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB93_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB93_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory: @@ -4335,10 +4411,11 @@ define i32 @flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i3 ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB93_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB93_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory: @@ -4356,10 +4433,11 @@ define i32 @flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i3 ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB93_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB93_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i64 4 @@ -4386,11 +4464,12 @@ define void @flat_atomic_umax_i32_noret(ptr %ptr, i32 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN1-NEXT: v_mov_b32_e32 v4, v3 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB94_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB94_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i32_noret: @@ -4407,11 +4486,12 @@ define void @flat_atomic_umax_i32_noret(ptr %ptr, i32 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN2-NEXT: v_mov_b32_e32 v4, v3 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB94_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB94_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i32_noret: @@ -4428,11 +4508,12 @@ define void @flat_atomic_umax_i32_noret(ptr %ptr, i32 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN3-NEXT: v_mov_b32_e32 v4, v3 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB94_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB94_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw umax ptr %ptr, i32 %in seq_cst ret void @@ -4455,11 +4536,12 @@ define void @flat_atomic_umax_i32_noret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN1-NEXT: v_mov_b32_e32 v4, v3 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB95_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB95_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i32_noret_offset: @@ -4478,11 +4560,12 @@ define void @flat_atomic_umax_i32_noret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN2-NEXT: v_mov_b32_e32 v4, v3 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB95_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB95_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i32_noret_offset: @@ -4499,11 +4582,12 @@ define void @flat_atomic_umax_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN3-NEXT: v_mov_b32_e32 v4, v3 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB95_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB95_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw umax ptr %gep, i32 %in seq_cst @@ -4526,10 +4610,11 @@ define i32 @flat_atomic_umax_i32_ret(ptr %ptr, i32 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB96_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB96_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v3 ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -4548,10 +4633,11 @@ define i32 @flat_atomic_umax_i32_ret(ptr %ptr, i32 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB96_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB96_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v3 ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -4570,10 +4656,11 @@ define i32 @flat_atomic_umax_i32_ret(ptr %ptr, i32 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB96_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB96_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw umax ptr %ptr, i32 %in seq_cst @@ -4598,10 +4685,11 @@ define i32 @flat_atomic_umax_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB97_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB97_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i32_ret_offset: @@ -4621,10 +4709,11 @@ define i32 @flat_atomic_umax_i32_ret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB97_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB97_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i32_ret_offset: @@ -4642,10 +4731,11 @@ define i32 @flat_atomic_umax_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB97_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB97_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 @@ -4670,11 +4760,12 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB98_1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB98_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i32_noret_scalar: @@ -4693,11 +4784,12 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB98_1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB98_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i32_noret_scalar: @@ -4716,11 +4808,12 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB98_1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB98_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw umax ptr %ptr, i32 %in seq_cst ret void @@ -4745,11 +4838,12 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB99_1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB99_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i32_noret_offset_scalar: @@ -4770,11 +4864,12 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB99_1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB99_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i32_noret_offset_scalar: @@ -4793,11 +4888,12 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB99_1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB99_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw umax ptr %gep, i32 %in seq_cst @@ -4824,10 +4920,11 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB100_1 +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB100_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i32_ret_scalar: @@ -4849,10 +4946,11 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB100_1 +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB100_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i32_ret_scalar: @@ -4874,10 +4972,11 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB100_1 +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB100_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw umax ptr %ptr, i32 %in seq_cst ret i32 %result @@ -4903,10 +5002,11 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB101_1 +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB101_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i32_ret_offset_scalar: @@ -4928,10 +5028,11 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB101_1 +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB101_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i32_ret_offset_scalar: @@ -4953,10 +5054,11 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB101_1 +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB101_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw umax ptr %gep, i32 %in seq_cst @@ -4988,9 +5090,11 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN1-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GCN1-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB102_1 +; GCN1-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GCN1-NEXT: s_cbranch_scc1 .LBB102_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; @@ -5018,9 +5122,11 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN2-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GCN2-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB102_1 +; GCN2-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GCN2-NEXT: s_cbranch_scc1 .LBB102_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -5046,9 +5152,11 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GCN3-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB102_1 +; GCN3-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GCN3-NEXT: s_cbranch_scc1 .LBB102_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: @@ -5085,10 +5193,11 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB103_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN1-NEXT: s_cbranch_scc1 .LBB103_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dword v[0:1], v2 @@ -5120,10 +5229,11 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB103_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN2-NEXT: s_cbranch_scc1 .LBB103_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 @@ -5153,10 +5263,11 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB103_1 +; GCN3-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GCN3-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GCN3-NEXT: s_cbranch_scc1 .LBB103_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dword v[0:1], v2 @@ -5194,10 +5305,11 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB104_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN1-NEXT: s_cbranch_scc1 .LBB104_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dword v[0:1], v2 @@ -5227,10 +5339,11 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB104_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN2-NEXT: s_cbranch_scc1 .LBB104_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 @@ -5260,10 +5373,11 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB104_1 +; GCN3-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GCN3-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GCN3-NEXT: s_cbranch_scc1 .LBB104_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dword v[0:1], v2 @@ -5292,11 +5406,12 @@ define void @flat_umax_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 % ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN1-NEXT: v_mov_b32_e32 v4, v3 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB105_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB105_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_umax_i32_noret_offset__amdgpu_no_remote_memory: @@ -5315,11 +5430,12 @@ define void @flat_umax_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 % ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN2-NEXT: v_mov_b32_e32 v4, v3 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB105_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB105_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_umax_i32_noret_offset__amdgpu_no_remote_memory: @@ -5336,11 +5452,12 @@ define void @flat_umax_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 % ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN3-NEXT: v_mov_b32_e32 v4, v3 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB105_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB105_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i64 4 %tmp0 = atomicrmw umax ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -5365,10 +5482,11 @@ define i32 @flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB106_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB106_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory: @@ -5388,10 +5506,11 @@ define i32 @flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB106_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB106_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory: @@ -5409,10 +5528,11 @@ define i32 @flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB106_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB106_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i64 4 @@ -5439,11 +5559,12 @@ define void @flat_atomic_umin_i32_noret(ptr %ptr, i32 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN1-NEXT: v_mov_b32_e32 v4, v3 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB107_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB107_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umin_i32_noret: @@ -5460,11 +5581,12 @@ define void @flat_atomic_umin_i32_noret(ptr %ptr, i32 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN2-NEXT: v_mov_b32_e32 v4, v3 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB107_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB107_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i32_noret: @@ -5481,11 +5603,12 @@ define void @flat_atomic_umin_i32_noret(ptr %ptr, i32 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN3-NEXT: v_mov_b32_e32 v4, v3 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB107_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB107_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw umin ptr %ptr, i32 %in seq_cst ret void @@ -5508,11 +5631,12 @@ define void @flat_atomic_umin_i32_noret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN1-NEXT: v_mov_b32_e32 v4, v3 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB108_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB108_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umin_i32_noret_offset: @@ -5531,11 +5655,12 @@ define void @flat_atomic_umin_i32_noret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN2-NEXT: v_mov_b32_e32 v4, v3 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB108_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB108_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i32_noret_offset: @@ -5552,11 +5677,12 @@ define void @flat_atomic_umin_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN3-NEXT: v_mov_b32_e32 v4, v3 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB108_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB108_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw umin ptr %gep, i32 %in seq_cst @@ -5579,10 +5705,11 @@ define i32 @flat_atomic_umin_i32_ret(ptr %ptr, i32 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB109_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB109_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v3 ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -5601,10 +5728,11 @@ define i32 @flat_atomic_umin_i32_ret(ptr %ptr, i32 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB109_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB109_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v3 ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -5623,10 +5751,11 @@ define i32 @flat_atomic_umin_i32_ret(ptr %ptr, i32 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB109_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB109_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw umin ptr %ptr, i32 %in seq_cst @@ -5651,10 +5780,11 @@ define i32 @flat_atomic_umin_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB110_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB110_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umin_i32_ret_offset: @@ -5674,10 +5804,11 @@ define i32 @flat_atomic_umin_i32_ret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB110_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB110_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i32_ret_offset: @@ -5695,10 +5826,11 @@ define i32 @flat_atomic_umin_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB110_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB110_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 @@ -5723,11 +5855,12 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB111_1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB111_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umin_i32_noret_scalar: @@ -5746,11 +5879,12 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB111_1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB111_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i32_noret_scalar: @@ -5769,11 +5903,12 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB111_1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB111_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw umin ptr %ptr, i32 %in seq_cst ret void @@ -5798,11 +5933,12 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB112_1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB112_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umin_i32_noret_offset_scalar: @@ -5823,11 +5959,12 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB112_1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB112_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i32_noret_offset_scalar: @@ -5846,11 +5983,12 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB112_1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB112_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw umin ptr %gep, i32 %in seq_cst @@ -5877,10 +6015,11 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB113_1 +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB113_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umin_i32_ret_scalar: @@ -5902,10 +6041,11 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB113_1 +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB113_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i32_ret_scalar: @@ -5927,10 +6067,11 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB113_1 +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB113_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw umin ptr %ptr, i32 %in seq_cst ret i32 %result @@ -5956,10 +6097,11 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB114_1 +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB114_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umin_i32_ret_offset_scalar: @@ -5981,10 +6123,11 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB114_1 +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB114_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i32_ret_offset_scalar: @@ -6006,10 +6149,11 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB114_1 +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB114_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw umin ptr %gep, i32 %in seq_cst @@ -6033,11 +6177,12 @@ define void @flat_umin_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 % ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN1-NEXT: v_mov_b32_e32 v4, v3 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB115_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB115_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_umin_i32_noret_offset__amdgpu_no_remote_memory: @@ -6056,11 +6201,12 @@ define void @flat_umin_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 % ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN2-NEXT: v_mov_b32_e32 v4, v3 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB115_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB115_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_umin_i32_noret_offset__amdgpu_no_remote_memory: @@ -6077,11 +6223,12 @@ define void @flat_umin_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 % ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN3-NEXT: v_mov_b32_e32 v4, v3 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB115_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB115_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i64 4 %tmp0 = atomicrmw umin ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -6106,10 +6253,11 @@ define i32 @flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB116_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB116_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory: @@ -6129,10 +6277,11 @@ define i32 @flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB116_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB116_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory: @@ -6150,10 +6299,11 @@ define i32 @flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB116_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB116_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i64 4 @@ -6180,11 +6330,12 @@ define void @flat_atomic_min_i32_noret(ptr %ptr, i32 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN1-NEXT: v_mov_b32_e32 v4, v3 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB117_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB117_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i32_noret: @@ -6201,11 +6352,12 @@ define void @flat_atomic_min_i32_noret(ptr %ptr, i32 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN2-NEXT: v_mov_b32_e32 v4, v3 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB117_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB117_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i32_noret: @@ -6222,11 +6374,12 @@ define void @flat_atomic_min_i32_noret(ptr %ptr, i32 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN3-NEXT: v_mov_b32_e32 v4, v3 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB117_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB117_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw min ptr %ptr, i32 %in seq_cst ret void @@ -6249,11 +6402,12 @@ define void @flat_atomic_min_i32_noret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN1-NEXT: v_mov_b32_e32 v4, v3 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB118_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB118_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i32_noret_offset: @@ -6272,11 +6426,12 @@ define void @flat_atomic_min_i32_noret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN2-NEXT: v_mov_b32_e32 v4, v3 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB118_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB118_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i32_noret_offset: @@ -6293,11 +6448,12 @@ define void @flat_atomic_min_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN3-NEXT: v_mov_b32_e32 v4, v3 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB118_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB118_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw min ptr %gep, i32 %in seq_cst @@ -6320,10 +6476,11 @@ define i32 @flat_atomic_min_i32_ret(ptr %ptr, i32 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB119_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB119_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v3 ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -6342,10 +6499,11 @@ define i32 @flat_atomic_min_i32_ret(ptr %ptr, i32 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB119_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB119_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v3 ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -6364,10 +6522,11 @@ define i32 @flat_atomic_min_i32_ret(ptr %ptr, i32 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB119_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB119_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw min ptr %ptr, i32 %in seq_cst @@ -6392,10 +6551,11 @@ define i32 @flat_atomic_min_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB120_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB120_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i32_ret_offset: @@ -6415,10 +6575,11 @@ define i32 @flat_atomic_min_i32_ret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB120_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB120_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i32_ret_offset: @@ -6436,10 +6597,11 @@ define i32 @flat_atomic_min_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB120_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB120_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 @@ -6464,11 +6626,12 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB121_1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB121_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i32_noret_scalar: @@ -6487,11 +6650,12 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB121_1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB121_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i32_noret_scalar: @@ -6510,11 +6674,12 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB121_1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB121_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw min ptr %ptr, i32 %in seq_cst ret void @@ -6539,11 +6704,12 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB122_1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB122_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i32_noret_offset_scalar: @@ -6564,11 +6730,12 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB122_1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB122_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i32_noret_offset_scalar: @@ -6587,11 +6754,12 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB122_1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB122_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw min ptr %gep, i32 %in seq_cst @@ -6618,10 +6786,11 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB123_1 +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB123_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i32_ret_scalar: @@ -6643,10 +6812,11 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB123_1 +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB123_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i32_ret_scalar: @@ -6668,10 +6838,11 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB123_1 +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB123_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw min ptr %ptr, i32 %in seq_cst ret i32 %result @@ -6697,10 +6868,11 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB124_1 +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB124_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i32_ret_offset_scalar: @@ -6722,10 +6894,11 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB124_1 +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB124_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i32_ret_offset_scalar: @@ -6747,10 +6920,11 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB124_1 +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB124_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw min ptr %gep, i32 %in seq_cst @@ -6782,9 +6956,11 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN1-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GCN1-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB125_1 +; GCN1-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GCN1-NEXT: s_cbranch_scc1 .LBB125_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; @@ -6812,9 +6988,11 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN2-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GCN2-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB125_1 +; GCN2-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GCN2-NEXT: s_cbranch_scc1 .LBB125_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -6840,9 +7018,11 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GCN3-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB125_1 +; GCN3-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GCN3-NEXT: s_cbranch_scc1 .LBB125_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: @@ -6879,10 +7059,11 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB126_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN1-NEXT: s_cbranch_scc1 .LBB126_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dword v[0:1], v2 @@ -6914,10 +7095,11 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB126_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN2-NEXT: s_cbranch_scc1 .LBB126_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 @@ -6947,10 +7129,11 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB126_1 +; GCN3-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GCN3-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GCN3-NEXT: s_cbranch_scc1 .LBB126_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dword v[0:1], v2 @@ -6982,9 +7165,11 @@ define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN1-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GCN1-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB127_1 +; GCN1-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GCN1-NEXT: s_cbranch_scc1 .LBB127_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; @@ -7006,9 +7191,11 @@ define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN2-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GCN2-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB127_1 +; GCN2-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GCN2-NEXT: s_cbranch_scc1 .LBB127_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -7030,9 +7217,11 @@ define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GCN3-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB127_1 +; GCN3-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GCN3-NEXT: s_cbranch_scc1 .LBB127_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: @@ -7065,10 +7254,11 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB128_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN1-NEXT: s_cbranch_scc1 .LBB128_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dword v[0:1], v2 @@ -7098,10 +7288,11 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB128_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN2-NEXT: s_cbranch_scc1 .LBB128_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 @@ -7131,10 +7322,11 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB128_1 +; GCN3-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GCN3-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GCN3-NEXT: s_cbranch_scc1 .LBB128_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dword v[0:1], v2 @@ -7163,11 +7355,12 @@ define void @flat_min_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %i ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN1-NEXT: v_mov_b32_e32 v4, v3 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB129_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB129_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_min_i32_noret_offset__amdgpu_no_remote_memory: @@ -7186,11 +7379,12 @@ define void @flat_min_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %i ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN2-NEXT: v_mov_b32_e32 v4, v3 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB129_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB129_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_min_i32_noret_offset__amdgpu_no_remote_memory: @@ -7207,11 +7401,12 @@ define void @flat_min_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %i ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN3-NEXT: v_mov_b32_e32 v4, v3 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB129_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB129_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i64 4 %tmp0 = atomicrmw min ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -7236,10 +7431,11 @@ define i32 @flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i3 ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB130_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB130_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory: @@ -7259,10 +7455,11 @@ define i32 @flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i3 ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB130_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB130_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory: @@ -7280,10 +7477,11 @@ define i32 @flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i3 ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB130_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB130_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i64 4 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll index d812b4b7d86e6..eca5f1f11c09a 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll @@ -1839,11 +1839,12 @@ define void @flat_atomic_nand_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN1-NEXT: v_mov_b32_e32 v7, v5 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN1-NEXT: v_mov_b32_e32 v6, v4 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB50_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB50_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i64_noret: @@ -1867,11 +1868,12 @@ define void @flat_atomic_nand_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN2-NEXT: v_mov_b32_e32 v7, v5 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN2-NEXT: v_mov_b32_e32 v6, v4 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB50_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB50_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i64_noret: @@ -1892,11 +1894,12 @@ define void @flat_atomic_nand_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: v_mov_b32_e32 v7, v5 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB50_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB50_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw nand ptr %ptr, i64 %in seq_cst ret void @@ -1926,11 +1929,12 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN1-NEXT: v_mov_b32_e32 v7, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN1-NEXT: v_mov_b32_e32 v6, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB51_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB51_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i64_noret_offset: @@ -1956,11 +1960,12 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN2-NEXT: v_mov_b32_e32 v7, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN2-NEXT: v_mov_b32_e32 v6, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB51_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB51_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i64_noret_offset: @@ -1981,11 +1986,12 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: v_mov_b32_e32 v7, v5 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB51_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB51_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst @@ -2015,10 +2021,11 @@ define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB52_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB52_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v4 ; GCN1-NEXT: v_mov_b32_e32 v1, v5 ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -2045,10 +2052,11 @@ define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB52_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB52_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v4 ; GCN2-NEXT: v_mov_b32_e32 v1, v5 ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -2072,10 +2080,11 @@ define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB52_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB52_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -2108,10 +2117,11 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB53_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB53_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i64_ret_offset: @@ -2138,10 +2148,11 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB53_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB53_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i64_ret_offset: @@ -2163,10 +2174,11 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB53_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB53_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -2203,11 +2215,12 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB54_1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB54_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i64_noret_scalar: @@ -2237,11 +2250,12 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB54_1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB54_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i64_noret_scalar: @@ -2266,11 +2280,12 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB54_1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB54_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw nand ptr %ptr, i64 %in seq_cst ret void @@ -2304,11 +2319,12 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB55_1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB55_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i64_noret_offset_scalar: @@ -2338,11 +2354,12 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB55_1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB55_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i64_noret_offset_scalar: @@ -2367,11 +2384,12 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB55_1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB55_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst @@ -2407,10 +2425,11 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB56_1 +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB56_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i64_ret_scalar: @@ -2441,10 +2460,11 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB56_1 +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB56_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i64_ret_scalar: @@ -2470,10 +2490,11 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB56_1 +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB56_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw nand ptr %ptr, i64 %in seq_cst ret i64 %result @@ -2508,10 +2529,11 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB57_1 +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB57_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i64_ret_offset_scalar: @@ -2542,10 +2564,11 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB57_1 +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB57_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i64_ret_offset_scalar: @@ -2571,10 +2594,11 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB57_1 +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB57_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw nand ptr %gep, i64 %in seq_cst @@ -2605,11 +2629,12 @@ define void @flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN1-NEXT: v_mov_b32_e32 v7, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN1-NEXT: v_mov_b32_e32 v6, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB58_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB58_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory: @@ -2635,11 +2660,12 @@ define void @flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN2-NEXT: v_mov_b32_e32 v7, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN2-NEXT: v_mov_b32_e32 v6, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB58_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB58_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory: @@ -2660,11 +2686,12 @@ define void @flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: v_mov_b32_e32 v7, v5 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB58_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB58_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -2696,10 +2723,11 @@ define i64 @flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB59_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB59_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory: @@ -2726,10 +2754,11 @@ define i64 @flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB59_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB59_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory: @@ -2751,10 +2780,11 @@ define i64 @flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB59_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB59_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -3512,11 +3542,12 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN1-NEXT: v_mov_b32_e32 v7, v5 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN1-NEXT: v_mov_b32_e32 v6, v4 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB80_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB80_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i64_noret: @@ -3539,11 +3570,12 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN2-NEXT: v_mov_b32_e32 v7, v5 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN2-NEXT: v_mov_b32_e32 v6, v4 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB80_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB80_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i64_noret: @@ -3563,11 +3595,12 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: v_mov_b32_e32 v7, v5 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB80_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB80_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst ret void @@ -3596,11 +3629,12 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN1-NEXT: v_mov_b32_e32 v7, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN1-NEXT: v_mov_b32_e32 v6, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB81_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB81_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i64_noret_offset: @@ -3625,11 +3659,12 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN2-NEXT: v_mov_b32_e32 v7, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN2-NEXT: v_mov_b32_e32 v6, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB81_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB81_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i64_noret_offset: @@ -3649,11 +3684,12 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: v_mov_b32_e32 v7, v5 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB81_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB81_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst @@ -3682,10 +3718,11 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB82_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB82_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v4 ; GCN1-NEXT: v_mov_b32_e32 v1, v5 ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -3711,10 +3748,11 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB82_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB82_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v4 ; GCN2-NEXT: v_mov_b32_e32 v1, v5 ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -3737,10 +3775,11 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB82_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB82_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -3772,10 +3811,11 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB83_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB83_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i64_ret_offset: @@ -3801,10 +3841,11 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB83_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB83_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i64_ret_offset: @@ -3825,10 +3866,11 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB83_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB83_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -3866,11 +3908,12 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB84_1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB84_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i64_noret_scalar: @@ -3901,11 +3944,12 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB84_1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB84_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i64_noret_scalar: @@ -3931,11 +3975,12 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB84_1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB84_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst ret void @@ -3970,11 +4015,12 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB85_1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB85_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i64_noret_offset_scalar: @@ -4005,11 +4051,12 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB85_1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB85_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i64_noret_offset_scalar: @@ -4035,11 +4082,12 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB85_1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB85_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst @@ -4076,10 +4124,11 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB86_1 +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB86_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i64_ret_scalar: @@ -4111,10 +4160,11 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB86_1 +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB86_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i64_ret_scalar: @@ -4141,10 +4191,11 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB86_1 +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB86_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw max ptr %ptr, i64 %in seq_cst ret i64 %result @@ -4180,10 +4231,11 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB87_1 +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB87_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i64_ret_offset_scalar: @@ -4215,10 +4267,11 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB87_1 +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB87_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i64_ret_offset_scalar: @@ -4245,10 +4298,11 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB87_1 +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB87_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw max ptr %gep, i64 %in seq_cst @@ -4284,9 +4338,11 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN1-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GCN1-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB88_1 +; GCN1-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GCN1-NEXT: s_cbranch_scc1 .LBB88_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; @@ -4318,9 +4374,11 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN2-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GCN2-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB88_1 +; GCN2-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GCN2-NEXT: s_cbranch_scc1 .LBB88_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -4350,9 +4408,11 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GCN3-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB88_1 +; GCN3-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GCN3-NEXT: s_cbranch_scc1 .LBB88_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: @@ -4391,10 +4451,11 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB89_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN1-NEXT: s_cbranch_scc1 .LBB89_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -4428,10 +4489,11 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB89_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN2-NEXT: s_cbranch_scc1 .LBB89_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -4463,10 +4525,11 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB89_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN3-NEXT: s_cbranch_scc1 .LBB89_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s2 ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -4506,9 +4569,11 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN1-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GCN1-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB90_1 +; GCN1-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GCN1-NEXT: s_cbranch_scc1 .LBB90_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; @@ -4538,9 +4603,11 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN2-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GCN2-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB90_1 +; GCN2-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GCN2-NEXT: s_cbranch_scc1 .LBB90_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -4570,9 +4637,11 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GCN3-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB90_1 +; GCN3-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GCN3-NEXT: s_cbranch_scc1 .LBB90_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: @@ -4608,10 +4677,11 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB91_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN1-NEXT: s_cbranch_scc1 .LBB91_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -4643,10 +4713,11 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB91_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN2-NEXT: s_cbranch_scc1 .LBB91_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -4678,10 +4749,11 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB91_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN3-NEXT: s_cbranch_scc1 .LBB91_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s2 ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -4716,11 +4788,12 @@ define void @flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN1-NEXT: v_mov_b32_e32 v7, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN1-NEXT: v_mov_b32_e32 v6, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB92_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB92_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory: @@ -4745,11 +4818,12 @@ define void @flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN2-NEXT: v_mov_b32_e32 v7, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN2-NEXT: v_mov_b32_e32 v6, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB92_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB92_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory: @@ -4769,11 +4843,12 @@ define void @flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: v_mov_b32_e32 v7, v5 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB92_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB92_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -4804,10 +4879,11 @@ define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB93_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB93_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory: @@ -4833,10 +4909,11 @@ define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB93_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB93_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory: @@ -4857,10 +4934,11 @@ define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB93_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB93_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -4894,11 +4972,12 @@ define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN1-NEXT: v_mov_b32_e32 v7, v5 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN1-NEXT: v_mov_b32_e32 v6, v4 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB94_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB94_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i64_noret: @@ -4921,11 +5000,12 @@ define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN2-NEXT: v_mov_b32_e32 v7, v5 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN2-NEXT: v_mov_b32_e32 v6, v4 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB94_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB94_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i64_noret: @@ -4945,11 +5025,12 @@ define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: v_mov_b32_e32 v7, v5 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB94_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB94_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst ret void @@ -4978,11 +5059,12 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN1-NEXT: v_mov_b32_e32 v7, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN1-NEXT: v_mov_b32_e32 v6, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB95_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB95_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i64_noret_offset: @@ -5007,11 +5089,12 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN2-NEXT: v_mov_b32_e32 v7, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN2-NEXT: v_mov_b32_e32 v6, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB95_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB95_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i64_noret_offset: @@ -5031,11 +5114,12 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: v_mov_b32_e32 v7, v5 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB95_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB95_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst @@ -5064,10 +5148,11 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB96_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB96_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v4 ; GCN1-NEXT: v_mov_b32_e32 v1, v5 ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -5093,10 +5178,11 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB96_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB96_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v4 ; GCN2-NEXT: v_mov_b32_e32 v1, v5 ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -5119,10 +5205,11 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB96_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB96_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -5154,10 +5241,11 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB97_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB97_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i64_ret_offset: @@ -5183,10 +5271,11 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB97_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB97_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i64_ret_offset: @@ -5207,10 +5296,11 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB97_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB97_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -5248,11 +5338,12 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB98_1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB98_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i64_noret_scalar: @@ -5283,11 +5374,12 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB98_1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB98_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i64_noret_scalar: @@ -5313,11 +5405,12 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB98_1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB98_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst ret void @@ -5352,11 +5445,12 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB99_1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB99_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i64_noret_offset_scalar: @@ -5387,11 +5481,12 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB99_1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB99_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i64_noret_offset_scalar: @@ -5417,11 +5512,12 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB99_1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB99_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst @@ -5458,10 +5554,11 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB100_1 +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB100_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i64_ret_scalar: @@ -5493,10 +5590,11 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB100_1 +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB100_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i64_ret_scalar: @@ -5523,10 +5621,11 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB100_1 +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB100_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw umax ptr %ptr, i64 %in seq_cst ret i64 %result @@ -5562,10 +5661,11 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB101_1 +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB101_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i64_ret_offset_scalar: @@ -5597,10 +5697,11 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB101_1 +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB101_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i64_ret_offset_scalar: @@ -5627,10 +5728,11 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB101_1 +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB101_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw umax ptr %gep, i64 %in seq_cst @@ -5666,9 +5768,11 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN1-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GCN1-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB102_1 +; GCN1-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GCN1-NEXT: s_cbranch_scc1 .LBB102_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; @@ -5700,9 +5804,11 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN2-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GCN2-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB102_1 +; GCN2-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GCN2-NEXT: s_cbranch_scc1 .LBB102_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -5732,9 +5838,11 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GCN3-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB102_1 +; GCN3-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GCN3-NEXT: s_cbranch_scc1 .LBB102_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: @@ -5773,10 +5881,11 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB103_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN1-NEXT: s_cbranch_scc1 .LBB103_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -5810,10 +5919,11 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB103_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN2-NEXT: s_cbranch_scc1 .LBB103_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -5845,10 +5955,11 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB103_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN3-NEXT: s_cbranch_scc1 .LBB103_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s2 ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -5888,10 +5999,11 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB104_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN1-NEXT: s_cbranch_scc1 .LBB104_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -5923,10 +6035,11 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB104_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN2-NEXT: s_cbranch_scc1 .LBB104_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -5958,10 +6071,11 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB104_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN3-NEXT: s_cbranch_scc1 .LBB104_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s2 ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -5996,11 +6110,12 @@ define void @flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN1-NEXT: v_mov_b32_e32 v7, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN1-NEXT: v_mov_b32_e32 v6, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB105_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB105_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory: @@ -6025,11 +6140,12 @@ define void @flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN2-NEXT: v_mov_b32_e32 v7, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN2-NEXT: v_mov_b32_e32 v6, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB105_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB105_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory: @@ -6049,11 +6165,12 @@ define void @flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: v_mov_b32_e32 v7, v5 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB105_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB105_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -6084,10 +6201,11 @@ define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB106_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB106_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory: @@ -6113,10 +6231,11 @@ define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB106_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB106_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory: @@ -6137,10 +6256,11 @@ define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB106_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB106_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -6174,11 +6294,12 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN1-NEXT: v_mov_b32_e32 v7, v5 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN1-NEXT: v_mov_b32_e32 v6, v4 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB107_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB107_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umin_i64_noret: @@ -6201,11 +6322,12 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN2-NEXT: v_mov_b32_e32 v7, v5 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN2-NEXT: v_mov_b32_e32 v6, v4 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB107_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB107_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i64_noret: @@ -6225,11 +6347,12 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: v_mov_b32_e32 v7, v5 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB107_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB107_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw umin ptr %ptr, i64 %in seq_cst ret void @@ -6258,11 +6381,12 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN1-NEXT: v_mov_b32_e32 v7, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN1-NEXT: v_mov_b32_e32 v6, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB108_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB108_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umin_i64_noret_offset: @@ -6287,11 +6411,12 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN2-NEXT: v_mov_b32_e32 v7, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN2-NEXT: v_mov_b32_e32 v6, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB108_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB108_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i64_noret_offset: @@ -6311,11 +6436,12 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: v_mov_b32_e32 v7, v5 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB108_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB108_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst @@ -6344,10 +6470,11 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB109_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB109_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v4 ; GCN1-NEXT: v_mov_b32_e32 v1, v5 ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -6373,10 +6500,11 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB109_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB109_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v4 ; GCN2-NEXT: v_mov_b32_e32 v1, v5 ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -6399,10 +6527,11 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB109_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB109_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -6434,10 +6563,11 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB110_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB110_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umin_i64_ret_offset: @@ -6463,10 +6593,11 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB110_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB110_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i64_ret_offset: @@ -6487,10 +6618,11 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB110_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB110_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -6528,11 +6660,12 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB111_1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB111_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umin_i64_noret_scalar: @@ -6563,11 +6696,12 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB111_1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB111_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i64_noret_scalar: @@ -6593,11 +6727,12 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB111_1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB111_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw umin ptr %ptr, i64 %in seq_cst ret void @@ -6632,11 +6767,12 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB112_1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB112_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umin_i64_noret_offset_scalar: @@ -6667,11 +6803,12 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB112_1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB112_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i64_noret_offset_scalar: @@ -6697,11 +6834,12 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB112_1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB112_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst @@ -6738,10 +6876,11 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB113_1 +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB113_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umin_i64_ret_scalar: @@ -6773,10 +6912,11 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB113_1 +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB113_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i64_ret_scalar: @@ -6803,10 +6943,11 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB113_1 +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB113_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw umin ptr %ptr, i64 %in seq_cst ret i64 %result @@ -6842,10 +6983,11 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB114_1 +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB114_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umin_i64_ret_offset_scalar: @@ -6877,10 +7019,11 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB114_1 +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB114_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i64_ret_offset_scalar: @@ -6907,10 +7050,11 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB114_1 +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB114_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw umin ptr %gep, i64 %in seq_cst @@ -6940,11 +7084,12 @@ define void @flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN1-NEXT: v_mov_b32_e32 v7, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN1-NEXT: v_mov_b32_e32 v6, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB115_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB115_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory: @@ -6969,11 +7114,12 @@ define void @flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN2-NEXT: v_mov_b32_e32 v7, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN2-NEXT: v_mov_b32_e32 v6, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB115_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB115_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory: @@ -6993,11 +7139,12 @@ define void @flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: v_mov_b32_e32 v7, v5 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB115_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB115_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -7028,10 +7175,11 @@ define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB116_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB116_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory: @@ -7057,10 +7205,11 @@ define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB116_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB116_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory: @@ -7081,10 +7230,11 @@ define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB116_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB116_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -7118,11 +7268,12 @@ define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN1-NEXT: v_mov_b32_e32 v7, v5 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN1-NEXT: v_mov_b32_e32 v6, v4 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB117_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB117_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i64_noret: @@ -7145,11 +7296,12 @@ define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN2-NEXT: v_mov_b32_e32 v7, v5 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN2-NEXT: v_mov_b32_e32 v6, v4 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB117_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB117_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i64_noret: @@ -7169,11 +7321,12 @@ define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: v_mov_b32_e32 v7, v5 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB117_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB117_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw min ptr %ptr, i64 %in seq_cst ret void @@ -7202,11 +7355,12 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN1-NEXT: v_mov_b32_e32 v7, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN1-NEXT: v_mov_b32_e32 v6, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB118_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB118_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i64_noret_offset: @@ -7231,11 +7385,12 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN2-NEXT: v_mov_b32_e32 v7, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN2-NEXT: v_mov_b32_e32 v6, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB118_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB118_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i64_noret_offset: @@ -7255,11 +7410,12 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: v_mov_b32_e32 v7, v5 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB118_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB118_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst @@ -7288,10 +7444,11 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB119_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB119_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v4 ; GCN1-NEXT: v_mov_b32_e32 v1, v5 ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -7317,10 +7474,11 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB119_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB119_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v4 ; GCN2-NEXT: v_mov_b32_e32 v1, v5 ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -7343,10 +7501,11 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB119_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB119_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -7378,10 +7537,11 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB120_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB120_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i64_ret_offset: @@ -7407,10 +7567,11 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB120_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB120_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i64_ret_offset: @@ -7431,10 +7592,11 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB120_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB120_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -7472,11 +7634,12 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB121_1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB121_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i64_noret_scalar: @@ -7507,11 +7670,12 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB121_1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB121_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i64_noret_scalar: @@ -7537,11 +7701,12 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB121_1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB121_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw min ptr %ptr, i64 %in seq_cst ret void @@ -7576,11 +7741,12 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB122_1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB122_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i64_noret_offset_scalar: @@ -7611,11 +7777,12 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB122_1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB122_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i64_noret_offset_scalar: @@ -7641,11 +7808,12 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB122_1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB122_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst @@ -7682,10 +7850,11 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB123_1 +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB123_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i64_ret_scalar: @@ -7717,10 +7886,11 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB123_1 +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB123_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i64_ret_scalar: @@ -7747,10 +7917,11 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB123_1 +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB123_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw min ptr %ptr, i64 %in seq_cst ret i64 %result @@ -7786,10 +7957,11 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB124_1 +; GCN1-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN1-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN1-NEXT: s_cbranch_scc1 .LBB124_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i64_ret_offset_scalar: @@ -7821,10 +7993,11 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB124_1 +; GCN2-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN2-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN2-NEXT: s_cbranch_scc1 .LBB124_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i64_ret_offset_scalar: @@ -7851,10 +8024,11 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB124_1 +; GCN3-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GCN3-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GCN3-NEXT: s_cbranch_scc1 .LBB124_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw min ptr %gep, i64 %in seq_cst @@ -7890,9 +8064,11 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN1-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GCN1-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB125_1 +; GCN1-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GCN1-NEXT: s_cbranch_scc1 .LBB125_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; @@ -7924,9 +8100,11 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN2-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GCN2-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB125_1 +; GCN2-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GCN2-NEXT: s_cbranch_scc1 .LBB125_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -7956,9 +8134,11 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GCN3-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB125_1 +; GCN3-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GCN3-NEXT: s_cbranch_scc1 .LBB125_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: @@ -7997,10 +8177,11 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB126_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN1-NEXT: s_cbranch_scc1 .LBB126_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -8034,10 +8215,11 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB126_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN2-NEXT: s_cbranch_scc1 .LBB126_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -8069,10 +8251,11 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB126_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN3-NEXT: s_cbranch_scc1 .LBB126_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s2 ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -8110,9 +8293,11 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v3, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[0:1], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[6:7], s[0:1], -1 ; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB127_1 +; GCN1-NEXT: s_cselect_b64 exec, s[0:1], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB127_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; @@ -8140,9 +8325,11 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v3, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[0:1], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[6:7], s[0:1], -1 ; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB127_1 +; GCN2-NEXT: s_cselect_b64 exec, s[0:1], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB127_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -8170,9 +8357,11 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[0:1], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[6:7], s[0:1], -1 ; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB127_1 +; GCN3-NEXT: s_cselect_b64 exec, s[0:1], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB127_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: @@ -8207,10 +8396,11 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB128_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN1-NEXT: s_cbranch_scc1 .LBB128_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -8242,10 +8432,11 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB128_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN2-NEXT: s_cbranch_scc1 .LBB128_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -8277,10 +8468,11 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB128_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN3-NEXT: s_cbranch_scc1 .LBB128_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s2 ; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -8315,11 +8507,12 @@ define void @flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN1-NEXT: v_mov_b32_e32 v7, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN1-NEXT: v_mov_b32_e32 v6, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB129_1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB129_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory: @@ -8344,11 +8537,12 @@ define void @flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN2-NEXT: v_mov_b32_e32 v7, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN2-NEXT: v_mov_b32_e32 v6, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB129_1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB129_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory: @@ -8368,11 +8562,12 @@ define void @flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: v_mov_b32_e32 v7, v5 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB129_1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB129_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -8403,10 +8598,11 @@ define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB130_1 +; GCN1-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN1-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN1-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN1-NEXT: s_cbranch_scc1 .LBB130_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory: @@ -8432,10 +8628,11 @@ define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB130_1 +; GCN2-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN2-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN2-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN2-NEXT: s_cbranch_scc1 .LBB130_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory: @@ -8456,10 +8653,11 @@ define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB130_1 +; GCN3-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN3-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN3-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN3-NEXT: s_cbranch_scc1 .LBB130_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll index b32630a97b3ad..b41ee12ba5939 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll @@ -3997,10 +3997,12 @@ define void @v_fneg_copytoreg_f16(ptr addrspace(1) %out, half %a, half %b, half ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; SI-NEXT: v_mul_f32_e32 v2, v2, v3 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_cbranch_execz .LBB81_2 +; SI-NEXT: s_mov_b64 s[4:5], exec +; SI-NEXT: s_and_b64 s[6:7], vcc, -1 +; SI-NEXT: v_mul_f32_e32 v2, v2, v3 +; SI-NEXT: s_cmov_b64 exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB81_2 ; SI-NEXT: ; %bb.1: ; %if ; SI-NEXT: v_cvt_f16_f32_e64 v3, -v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -4010,8 +4012,8 @@ define void @v_fneg_copytoreg_f16(ptr addrspace(1) %out, half %a, half %b, half ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: flat_store_short v[0:1], v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: .LBB81_2: ; %endif ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: .LBB81_2: ; %endif ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: flat_store_short v[0:1], v2 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -4024,16 +4026,18 @@ define void @v_fneg_copytoreg_f16(ptr addrspace(1) %out, half %a, half %b, half ; VI-NEXT: v_lshlrev_b32_e32 v6, 1, v6 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mul_f16_e32 v2, v2, v3 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_cbranch_execz .LBB81_2 +; VI-NEXT: s_mov_b64 s[4:5], exec +; VI-NEXT: s_and_b64 s[6:7], vcc, -1 +; VI-NEXT: v_mul_f16_e32 v2, v2, v3 +; VI-NEXT: s_cmov_b64 exec, vcc +; VI-NEXT: s_cbranch_scc0 .LBB81_2 ; VI-NEXT: ; %bb.1: ; %if ; VI-NEXT: v_mul_f16_e64 v3, -v2, v4 ; VI-NEXT: flat_store_short v[0:1], v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: .LBB81_2: ; %endif ; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: .LBB81_2: ; %endif ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -4042,20 +4046,23 @@ define void @v_fneg_copytoreg_f16(ptr addrspace(1) %out, half %a, half %b, half ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v6, 0x3ff, v31 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v5 ; GFX11-NEXT: v_mul_f16_e32 v2, v2, v3 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_mov_b32 s1, exec_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 1, v6 -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v6 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v5 -; GFX11-NEXT: s_cbranch_execz .LBB81_2 +; GFX11-NEXT: v_add_co_u32 v0, s0, v0, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, s0, 0, v1, s0 +; GFX11-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX11-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB81_2 ; GFX11-NEXT: ; %bb.1: ; %if ; GFX11-NEXT: v_mul_f16_e64 v3, -v2, v4 ; GFX11-NEXT: global_store_b16 v[0:1], v3, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: .LBB81_2: ; %endif -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll index b5440b9c38c9f..bd0cd6d1d5c4f 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll @@ -2380,16 +2380,18 @@ define void @v_fneg_copytoreg_f32(ptr addrspace(1) %out, float %a, float %b, flo ; SI-NEXT: v_lshlrev_b32_e32 v6, 2, v6 ; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; SI-NEXT: v_mul_f32_e32 v2, v2, v3 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_cbranch_execz .LBB118_2 +; SI-NEXT: s_mov_b64 s[4:5], exec +; SI-NEXT: s_and_b64 s[6:7], vcc, -1 +; SI-NEXT: v_mul_f32_e32 v2, v2, v3 +; SI-NEXT: s_cmov_b64 exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB118_2 ; SI-NEXT: ; %bb.1: ; %if ; SI-NEXT: v_mul_f32_e64 v3, -v2, v4 ; SI-NEXT: flat_store_dword v[0:1], v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: .LBB118_2: ; %endif ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: .LBB118_2: ; %endif ; SI-NEXT: flat_store_dword v[0:1], v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -2401,16 +2403,18 @@ define void @v_fneg_copytoreg_f32(ptr addrspace(1) %out, float %a, float %b, flo ; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v6 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mul_f32_e32 v2, v2, v3 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_cbranch_execz .LBB118_2 +; VI-NEXT: s_mov_b64 s[4:5], exec +; VI-NEXT: s_and_b64 s[6:7], vcc, -1 +; VI-NEXT: v_mul_f32_e32 v2, v2, v3 +; VI-NEXT: s_cmov_b64 exec, vcc +; VI-NEXT: s_cbranch_scc0 .LBB118_2 ; VI-NEXT: ; %bb.1: ; %if ; VI-NEXT: v_mul_f32_e64 v3, -v2, v4 ; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: .LBB118_2: ; %endif ; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: .LBB118_2: ; %endif ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/fold-fabs.ll b/llvm/test/CodeGen/AMDGPU/fold-fabs.ll index a04bf44549325..a3ae039e52e61 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fold-fabs.ll @@ -9,11 +9,14 @@ define float @fold_abs_in_branch(float %arg1, float %arg2) { ; GFX10-NEXT: s_mov_b32 s4, exec_lo ; GFX10-NEXT: v_add_f32_e32 v1, v0, v1 ; GFX10-NEXT: v_add_f32_e64 v0, |v1|, |v1| -; GFX10-NEXT: v_cmpx_nlt_f32_e32 1.0, v0 +; GFX10-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 1.0, v0 +; GFX10-NEXT: s_and_b32 s5, vcc_lo, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX10-NEXT: ; %bb.1: ; %if ; GFX10-NEXT: v_mul_f32_e64 v0, 0x3e4ccccd, |v1| -; GFX10-NEXT: ; %bb.2: ; %exit ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: .LBB0_2: ; %exit ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: %0 = fadd reassoc nnan nsz arcp contract afn float %arg1, %arg2 @@ -40,11 +43,14 @@ define float @fold_abs_in_branch_multiple_users(float %arg1, float %arg2) { ; GFX10-NEXT: s_mov_b32 s4, exec_lo ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_add_f32_e64 v1, |v0|, |v0| -; GFX10-NEXT: v_cmpx_nlt_f32_e32 1.0, v1 +; GFX10-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 1.0, v1 +; GFX10-NEXT: s_and_b32 s5, vcc_lo, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX10-NEXT: ; %bb.1: ; %if ; GFX10-NEXT: v_mul_f32_e64 v1, 0x3e4ccccd, |v0| -; GFX10-NEXT: ; %bb.2: ; %exit ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: .LBB1_2: ; %exit ; GFX10-NEXT: v_add_f32_e64 v0, |v0|, 2.0 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -125,11 +131,14 @@ define float @fold_abs_in_branch_fabs(float %arg1, float %arg2) { ; GFX10-NEXT: s_mov_b32 s4, exec_lo ; GFX10-NEXT: v_add_f32_e32 v1, v0, v1 ; GFX10-NEXT: v_add_f32_e64 v0, |v1|, |v1| -; GFX10-NEXT: v_cmpx_nlt_f32_e32 1.0, v0 +; GFX10-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 1.0, v0 +; GFX10-NEXT: s_and_b32 s5, vcc_lo, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX10-NEXT: ; %bb.1: ; %if ; GFX10-NEXT: v_mul_f32_e64 v0, 0x3e4ccccd, |v1| -; GFX10-NEXT: ; %bb.2: ; %exit ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: .LBB4_2: ; %exit ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: %0 = fadd reassoc nnan nsz arcp contract afn float %arg1, %arg2 @@ -157,8 +166,10 @@ define float @fold_abs_in_branch_phi(float %arg1, float %arg2) { ; GFX10-NEXT: s_mov_b32 s4, exec_lo ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_add_f32_e64 v0, |v0|, |v0| -; GFX10-NEXT: v_cmpx_nlt_f32_e32 1.0, v0 -; GFX10-NEXT: s_cbranch_execz .LBB5_3 +; GFX10-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 1.0, v0 +; GFX10-NEXT: s_and_b32 s5, vcc_lo, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB5_4 ; GFX10-NEXT: ; %bb.1: ; %header.preheader ; GFX10-NEXT: ; implicit-def: $vgpr0 ; GFX10-NEXT: .LBB5_2: ; %header @@ -167,8 +178,9 @@ define float @fold_abs_in_branch_phi(float %arg1, float %arg2) { ; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, -1.0, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 ; GFX10-NEXT: s_cbranch_vccnz .LBB5_2 -; GFX10-NEXT: .LBB5_3: ; %Flow1 +; GFX10-NEXT: ; %bb.3: ; %Flow ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: .LBB5_4: ; %exit ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: %0 = fadd reassoc nnan nsz arcp contract afn float %arg1, %arg2 @@ -201,13 +213,16 @@ define float @fold_neg_in_branch(float %arg1, float %arg2) { ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX10-NEXT: s_mov_b32 s4, exec_lo ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 1.0, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_cmpx_nlt_f32_e32 1.0, v0 +; GFX10-NEXT: s_and_b32 s5, vcc_lo, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX10-NEXT: ; %bb.1: ; %if ; GFX10-NEXT: v_rcp_f32_e64 v1, -v0 ; GFX10-NEXT: v_mul_f32_e64 v1, |v0|, v1 -; GFX10-NEXT: ; %bb.2: ; %exit ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: .LBB6_2: ; %exit ; GFX10-NEXT: v_mul_f32_e64 v0, -v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll index 121fab51024fd..ff0def8b4df86 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -1185,8 +1185,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB42_3 +; GFX90A-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB42_3 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s6, s[2:3] @@ -1208,9 +1209,11 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX90A-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX90A-NEXT: s_cbranch_execnz .LBB42_2 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX90A-NEXT: s_cbranch_scc1 .LBB42_2 ; GFX90A-NEXT: .LBB42_3: ; GFX90A-NEXT: s_endpgm ; @@ -1220,8 +1223,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB42_2 +; GFX940-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX940-NEXT: s_cmov_b64 exec, vcc +; GFX940-NEXT: s_cbranch_scc0 .LBB42_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -1247,8 +1251,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB43_2 +; GFX90A-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB43_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -1268,8 +1273,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB43_2 +; GFX940-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX940-NEXT: s_cmov_b64 exec, vcc +; GFX940-NEXT: s_cbranch_scc0 .LBB43_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -1295,8 +1301,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB44_3 +; GFX90A-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB44_3 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s6, s[2:3] @@ -1318,9 +1325,11 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX90A-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX90A-NEXT: s_cbranch_execnz .LBB44_2 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX90A-NEXT: s_cbranch_scc1 .LBB44_2 ; GFX90A-NEXT: .LBB44_3: ; GFX90A-NEXT: s_endpgm ; @@ -1330,8 +1339,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB44_2 +; GFX940-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX940-NEXT: s_cmov_b64 exec, vcc +; GFX940-NEXT: s_cbranch_scc0 .LBB44_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -1357,8 +1367,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB45_2 +; GFX90A-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB45_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -1378,8 +1389,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB45_2 +; GFX940-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX940-NEXT: s_cmov_b64 exec, vcc +; GFX940-NEXT: s_cbranch_scc0 .LBB45_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -1435,10 +1447,11 @@ define double @global_atomic_fadd_f64_rtn_pat(ptr addrspace(1) %ptr, double %dat ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB47_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1500,10 +1513,11 @@ define double @global_atomic_fadd_f64_rtn_pat_system(ptr addrspace(1) %ptr, doub ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB49_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1567,8 +1581,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB52_3 +; GFX90A-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB52_3 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s6, s[2:3] @@ -1588,9 +1603,11 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX90A-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX90A-NEXT: s_cbranch_execnz .LBB52_2 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX90A-NEXT: s_cbranch_scc1 .LBB52_2 ; GFX90A-NEXT: .LBB52_3: ; GFX90A-NEXT: s_endpgm ; @@ -1600,8 +1617,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB52_2 +; GFX940-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX940-NEXT: s_cmov_b64 exec, vcc +; GFX940-NEXT: s_cbranch_scc0 .LBB52_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -1640,9 +1658,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX90A-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX90A-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX90A-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_endpgm ; @@ -1712,9 +1732,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX90A-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX90A-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX90A-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_endpgm ; @@ -1752,10 +1774,11 @@ define double @flat_atomic_fadd_f64_rtn_pat(ptr %ptr) #1 { ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1818,10 +1841,11 @@ define double @flat_atomic_fadd_f64_rtn_pat_system(ptr %ptr) #1 { ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB58_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1905,9 +1929,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX90A-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX90A-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX90A-NEXT: s_cbranch_execnz .LBB61_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX90A-NEXT: s_cbranch_scc1 .LBB61_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_endpgm ; @@ -2122,8 +2148,9 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB70_2 +; GFX90A-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB70_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3] @@ -2142,8 +2169,9 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB70_2 +; GFX940-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX940-NEXT: s_cmov_b64 exec, vcc +; GFX940-NEXT: s_cbranch_scc0 .LBB70_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3] @@ -2167,8 +2195,9 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3 ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB71_2 +; GFX90A-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB71_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3] @@ -2187,8 +2216,9 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3 ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB71_2 +; GFX940-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX940-NEXT: s_cmov_b64 exec, vcc +; GFX940-NEXT: s_cbranch_scc0 .LBB71_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3] @@ -2212,8 +2242,9 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB72_2 +; GFX90A-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB72_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3] @@ -2232,8 +2263,9 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB72_2 +; GFX940-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX940-NEXT: s_cmov_b64 exec, vcc +; GFX940-NEXT: s_cbranch_scc0 .LBB72_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll index 99818df6175bd..c6f5230ee398c 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll @@ -10,14 +10,16 @@ define i128 @fptosi_f64_to_i128(double %x) { ; SDAG-NEXT: v_bfe_u32 v6, v5, 20, 11 ; SDAG-NEXT: v_mov_b32_e32 v7, 0 ; SDAG-NEXT: s_mov_b64 s[4:5], 0x3fe -; SDAG-NEXT: v_mov_b32_e32 v4, v0 ; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[6:7] +; SDAG-NEXT: v_mov_b32_e32 v4, v0 ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: s_mov_b64 s[8:9], exec ; SDAG-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-NEXT: v_mov_b32_e32 v3, 0 -; SDAG-NEXT: s_and_saveexec_b64 s[8:9], vcc -; SDAG-NEXT: s_cbranch_execz .LBB0_10 +; SDAG-NEXT: s_and_b64 s[4:5], vcc, -1 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB0_10 ; SDAG-NEXT: ; %bb.1: ; %fp-to-i-if-end ; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6 ; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v7, vcc @@ -29,26 +31,29 @@ define i128 @fptosi_f64_to_i128(double %x) { ; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], -1, v[2:3] ; SDAG-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[4:5] ; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] +; SDAG-NEXT: s_and_b64 s[4:5], s[4:5], exec +; SDAG-NEXT: s_xor_b64 s[10:11], s[4:5], exec +; SDAG-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[6:7] -; SDAG-NEXT: s_cbranch_execz .LBB0_7 +; SDAG-NEXT: s_cmov_b64 exec, s[4:5] +; SDAG-NEXT: s_cbranch_scc0 .LBB0_7 ; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9 ; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SDAG-NEXT: v_add_co_u32_e64 v9, s[4:5], -1, v0 ; SDAG-NEXT: v_addc_co_u32_e64 v10, s[4:5], 0, -1, s[4:5] ; SDAG-NEXT: s_mov_b64 s[4:5], 0x432 -; SDAG-NEXT: v_and_b32_e32 v0, 0xfffff, v5 ; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[6:7] +; SDAG-NEXT: v_and_b32_e32 v0, 0xfffff, v5 +; SDAG-NEXT: s_xor_b64 s[12:13], s[4:5], exec ; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v11, -1, 1, vcc +; SDAG-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; SDAG-NEXT: v_or_b32_e32 v5, 0x100000, v0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[6:7] -; SDAG-NEXT: s_cbranch_execz .LBB0_4 +; SDAG-NEXT: s_cmov_b64 exec, s[4:5] +; SDAG-NEXT: s_cbranch_scc0 .LBB0_4 ; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-else ; SDAG-NEXT: v_sub_u32_e32 v0, 0x473, v6 ; SDAG-NEXT: v_add_u32_e32 v2, 0xfffffb8d, v6 @@ -90,9 +95,12 @@ define i128 @fptosi_f64_to_i128(double %x) { ; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; SDAG-NEXT: ; implicit-def: $vgpr9 ; SDAG-NEXT: ; implicit-def: $vgpr10 +; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] ; SDAG-NEXT: .LBB0_4: ; %Flow -; SDAG-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] -; SDAG-NEXT: s_cbranch_execz .LBB0_6 +; SDAG-NEXT: s_xor_b64 s[14:15], s[12:13], exec +; SDAG-NEXT: s_and_b64 s[4:5], s[12:13], -1 +; SDAG-NEXT: s_cmov_b64 exec, s[12:13] +; SDAG-NEXT: s_cbranch_scc0 .LBB0_6 ; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12 ; SDAG-NEXT: v_sub_u32_e32 v2, 0x433, v6 ; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[4:5] @@ -114,10 +122,14 @@ define i128 @fptosi_f64_to_i128(double %x) { ; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v6, v[2:3] ; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v6, v[3:4] ; SDAG-NEXT: v_mad_i32_i24 v3, v9, v5, v3 +; SDAG-NEXT: s_or_b64 exec, exec, s[14:15] ; SDAG-NEXT: .LBB0_6: ; %Flow1 -; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB0_7: ; %Flow2 -; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: s_xor_b64 s[4:5], s[10:11], exec +; SDAG-NEXT: s_and_b64 s[6:7], s[10:11], -1 +; SDAG-NEXT: s_cmov_b64 exec, s[10:11] +; SDAG-NEXT: s_cbranch_scc0 .LBB0_9 ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 @@ -125,10 +137,10 @@ define i128 @fptosi_f64_to_i128(double %x) { ; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; SDAG-NEXT: v_mov_b32_e32 v0, v2 ; SDAG-NEXT: v_mov_b32_e32 v1, v2 -; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] -; SDAG-NEXT: .LBB0_10: ; %fp-to-i-cleanup +; SDAG-NEXT: .LBB0_9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: .LBB0_10: ; %fp-to-i-cleanup ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: fptosi_f64_to_i128: @@ -139,17 +151,19 @@ define i128 @fptosi_f64_to_i128(double %x) { ; GISEL-NEXT: v_lshrrev_b32_e32 v0, 20, v5 ; GISEL-NEXT: v_and_b32_e32 v6, 0x7ff, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, 0x3ff -; GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-NEXT: v_mov_b32_e32 v7, 0 +; GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1] ; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] ; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: s_mov_b64 s[12:13], exec +; GISEL-NEXT: s_and_b64 s[8:9], vcc, -1 ; GISEL-NEXT: v_mov_b32_e32 v1, s5 ; GISEL-NEXT: v_mov_b32_e32 v2, s6 ; GISEL-NEXT: v_mov_b32_e32 v3, s7 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc -; GISEL-NEXT: s_cbranch_execz .LBB0_10 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB0_10 ; GISEL-NEXT: ; %bb.1: ; %fp-to-i-if-end ; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6 ; GISEL-NEXT: v_mov_b32_e32 v2, 0xffffff80 @@ -166,10 +180,11 @@ define i128 @fptosi_f64_to_i128(double %x) { ; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GISEL-NEXT: s_xor_b64 s[14:15], vcc, exec +; GISEL-NEXT: s_and_b64 s[6:7], vcc, -1 ; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GISEL-NEXT: s_xor_b64 s[14:15], exec, s[6:7] -; GISEL-NEXT: s_cbranch_execz .LBB0_7 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB0_7 ; GISEL-NEXT: ; %bb.2: ; %fp-to-i-if-end9 ; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[6:7] @@ -220,57 +235,61 @@ define i128 @fptosi_f64_to_i128(double %x) { ; GISEL-NEXT: v_or_b32_e32 v1, v1, v19 ; GISEL-NEXT: v_or_b32_e32 v0, v0, v20 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v20 -; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GISEL-NEXT: v_lshl_or_b32 v10, v0, 16, v0 -; GISEL-NEXT: v_or3_b32 v8, v1, v2, 1 -; GISEL-NEXT: v_or3_b32 v9, v0, v2, 0 +; GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GISEL-NEXT: v_or3_b32 v8, v1, v3, 1 ; GISEL-NEXT: v_mov_b32_e32 v0, 0x433 ; GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GISEL-NEXT: v_and_b32_e32 v2, 0xfffff, v5 ; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1] -; GISEL-NEXT: v_or_b32_e32 v5, 0x100000, v2 +; GISEL-NEXT: v_and_b32_e32 v0, 0xfffff, v5 +; GISEL-NEXT: s_xor_b64 s[16:17], vcc, exec +; GISEL-NEXT: v_lshl_or_b32 v9, v2, 16, v2 +; GISEL-NEXT: v_or3_b32 v10, v2, v3, 0 +; GISEL-NEXT: s_and_b64 s[6:7], vcc, -1 +; GISEL-NEXT: v_or_b32_e32 v5, 0x100000, v0 ; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] -; GISEL-NEXT: s_cbranch_execz .LBB0_4 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB0_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else ; GISEL-NEXT: v_add_u32_e32 v6, 0xfffffbcd, v6 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v6, v[4:5] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v10, 0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v9, 0 ; GISEL-NEXT: v_subrev_u32_e32 v7, 64, v6 ; GISEL-NEXT: v_sub_u32_e32 v2, 64, v6 ; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] ; GISEL-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v6 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v10, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v13, v2, 0, s[6:7] ; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[6:7] ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v11, v8, 0 ; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v11, v10 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v11, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v12, v10 +; GISEL-NEXT: v_mul_lo_u32 v6, v11, v9 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v11, v10, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v12, v9 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v8, v[1:2] ; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] ; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v10, v[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] -; GISEL-NEXT: ; implicit-def: $vgpr10 ; GISEL-NEXT: ; implicit-def: $vgpr9 +; GISEL-NEXT: ; implicit-def: $vgpr10 ; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr8 +; GISEL-NEXT: s_or_b64 exec, exec, s[16:17] ; GISEL-NEXT: .LBB0_4: ; %Flow -; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[16:17] -; GISEL-NEXT: s_cbranch_execz .LBB0_6 +; GISEL-NEXT: s_xor_b64 s[8:9], s[16:17], exec +; GISEL-NEXT: s_and_b64 s[6:7], s[16:17], -1 +; GISEL-NEXT: s_cmov_b64 exec, s[16:17] +; GISEL-NEXT: s_cbranch_scc0 .LBB0_6 ; GISEL-NEXT: ; %bb.5: ; %fp-to-i-if-then12 ; GISEL-NEXT: v_sub_co_u32_e32 v6, vcc, 0x433, v6 ; GISEL-NEXT: v_subrev_u32_e32 v2, 64, v6 @@ -281,21 +300,24 @@ define i128 @fptosi_f64_to_i128(double %x) { ; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v4, v10, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v4, v9, 0 ; GISEL-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v4, v8, 0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v5, v9, v[2:3] -; GISEL-NEXT: v_mul_lo_u32 v6, v5, v10 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v4, v10 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v5, v10, v[2:3] +; GISEL-NEXT: v_mul_lo_u32 v6, v5, v9 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v10, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v4, v9 ; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v5, v8, v[1:2] ; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v3, v4, s[6:7] ; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc -; GISEL-NEXT: .LBB0_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: .LBB0_6: ; %Flow1 +; GISEL-NEXT: s_or_b64 exec, exec, s[14:15] ; GISEL-NEXT: .LBB0_7: ; %Flow2 -; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[14:15] -; GISEL-NEXT: s_cbranch_execz .LBB0_9 +; GISEL-NEXT: s_xor_b64 s[6:7], s[14:15], exec +; GISEL-NEXT: s_and_b64 s[8:9], s[14:15], -1 +; GISEL-NEXT: s_cmov_b64 exec, s[14:15] +; GISEL-NEXT: s_cbranch_scc0 .LBB0_9 ; GISEL-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] ; GISEL-NEXT: v_and_b32_e32 v1, 1, v1 @@ -365,10 +387,10 @@ define i128 @fptosi_f64_to_i128(double %x) { ; GISEL-NEXT: v_or3_b32 v1, v2, v13, v1 ; GISEL-NEXT: v_add_u32_e32 v3, 0x80000000, v1 ; GISEL-NEXT: v_mov_b32_e32 v2, v1 -; GISEL-NEXT: .LBB0_9: ; %Flow3 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] -; GISEL-NEXT: .LBB0_10: ; %fp-to-i-cleanup +; GISEL-NEXT: .LBB0_9: ; %Flow3 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: .LBB0_10: ; %fp-to-i-cleanup ; GISEL-NEXT: s_setpc_b64 s[30:31] %cvt = fptosi double %x to i128 ret i128 %cvt @@ -382,14 +404,16 @@ define i128 @fptoui_f64_to_i128(double %x) { ; SDAG-NEXT: v_bfe_u32 v6, v5, 20, 11 ; SDAG-NEXT: v_mov_b32_e32 v7, 0 ; SDAG-NEXT: s_mov_b64 s[4:5], 0x3fe -; SDAG-NEXT: v_mov_b32_e32 v4, v0 ; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[6:7] +; SDAG-NEXT: v_mov_b32_e32 v4, v0 ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: s_mov_b64 s[8:9], exec ; SDAG-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-NEXT: v_mov_b32_e32 v3, 0 -; SDAG-NEXT: s_and_saveexec_b64 s[8:9], vcc -; SDAG-NEXT: s_cbranch_execz .LBB1_10 +; SDAG-NEXT: s_and_b64 s[4:5], vcc, -1 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB1_10 ; SDAG-NEXT: ; %bb.1: ; %fp-to-i-if-end ; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6 ; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v7, vcc @@ -401,26 +425,29 @@ define i128 @fptoui_f64_to_i128(double %x) { ; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], -1, v[2:3] ; SDAG-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[4:5] ; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] +; SDAG-NEXT: s_and_b64 s[4:5], s[4:5], exec +; SDAG-NEXT: s_xor_b64 s[10:11], s[4:5], exec +; SDAG-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[6:7] -; SDAG-NEXT: s_cbranch_execz .LBB1_7 +; SDAG-NEXT: s_cmov_b64 exec, s[4:5] +; SDAG-NEXT: s_cbranch_scc0 .LBB1_7 ; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9 ; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SDAG-NEXT: v_add_co_u32_e64 v9, s[4:5], -1, v0 ; SDAG-NEXT: v_addc_co_u32_e64 v10, s[4:5], 0, -1, s[4:5] ; SDAG-NEXT: s_mov_b64 s[4:5], 0x432 -; SDAG-NEXT: v_and_b32_e32 v0, 0xfffff, v5 ; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[6:7] +; SDAG-NEXT: v_and_b32_e32 v0, 0xfffff, v5 +; SDAG-NEXT: s_xor_b64 s[12:13], s[4:5], exec ; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v11, -1, 1, vcc +; SDAG-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; SDAG-NEXT: v_or_b32_e32 v5, 0x100000, v0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[6:7] -; SDAG-NEXT: s_cbranch_execz .LBB1_4 +; SDAG-NEXT: s_cmov_b64 exec, s[4:5] +; SDAG-NEXT: s_cbranch_scc0 .LBB1_4 ; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-else ; SDAG-NEXT: v_sub_u32_e32 v0, 0x473, v6 ; SDAG-NEXT: v_add_u32_e32 v2, 0xfffffb8d, v6 @@ -462,9 +489,12 @@ define i128 @fptoui_f64_to_i128(double %x) { ; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; SDAG-NEXT: ; implicit-def: $vgpr9 ; SDAG-NEXT: ; implicit-def: $vgpr10 +; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] ; SDAG-NEXT: .LBB1_4: ; %Flow -; SDAG-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] -; SDAG-NEXT: s_cbranch_execz .LBB1_6 +; SDAG-NEXT: s_xor_b64 s[14:15], s[12:13], exec +; SDAG-NEXT: s_and_b64 s[4:5], s[12:13], -1 +; SDAG-NEXT: s_cmov_b64 exec, s[12:13] +; SDAG-NEXT: s_cbranch_scc0 .LBB1_6 ; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12 ; SDAG-NEXT: v_sub_u32_e32 v2, 0x433, v6 ; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[4:5] @@ -486,10 +516,14 @@ define i128 @fptoui_f64_to_i128(double %x) { ; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v6, v[2:3] ; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v6, v[3:4] ; SDAG-NEXT: v_mad_i32_i24 v3, v9, v5, v3 +; SDAG-NEXT: s_or_b64 exec, exec, s[14:15] ; SDAG-NEXT: .LBB1_6: ; %Flow1 -; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB1_7: ; %Flow2 -; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: s_xor_b64 s[4:5], s[10:11], exec +; SDAG-NEXT: s_and_b64 s[6:7], s[10:11], -1 +; SDAG-NEXT: s_cmov_b64 exec, s[10:11] +; SDAG-NEXT: s_cbranch_scc0 .LBB1_9 ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 @@ -497,10 +531,10 @@ define i128 @fptoui_f64_to_i128(double %x) { ; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; SDAG-NEXT: v_mov_b32_e32 v0, v2 ; SDAG-NEXT: v_mov_b32_e32 v1, v2 -; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] -; SDAG-NEXT: .LBB1_10: ; %fp-to-i-cleanup +; SDAG-NEXT: .LBB1_9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: .LBB1_10: ; %fp-to-i-cleanup ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: fptoui_f64_to_i128: @@ -511,17 +545,19 @@ define i128 @fptoui_f64_to_i128(double %x) { ; GISEL-NEXT: v_lshrrev_b32_e32 v0, 20, v5 ; GISEL-NEXT: v_and_b32_e32 v6, 0x7ff, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, 0x3ff -; GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-NEXT: v_mov_b32_e32 v7, 0 +; GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1] ; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] ; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: s_mov_b64 s[12:13], exec +; GISEL-NEXT: s_and_b64 s[8:9], vcc, -1 ; GISEL-NEXT: v_mov_b32_e32 v1, s5 ; GISEL-NEXT: v_mov_b32_e32 v2, s6 ; GISEL-NEXT: v_mov_b32_e32 v3, s7 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc -; GISEL-NEXT: s_cbranch_execz .LBB1_10 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB1_10 ; GISEL-NEXT: ; %bb.1: ; %fp-to-i-if-end ; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6 ; GISEL-NEXT: v_mov_b32_e32 v2, 0xffffff80 @@ -538,10 +574,11 @@ define i128 @fptoui_f64_to_i128(double %x) { ; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GISEL-NEXT: s_xor_b64 s[14:15], vcc, exec +; GISEL-NEXT: s_and_b64 s[6:7], vcc, -1 ; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GISEL-NEXT: s_xor_b64 s[14:15], exec, s[6:7] -; GISEL-NEXT: s_cbranch_execz .LBB1_7 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB1_7 ; GISEL-NEXT: ; %bb.2: ; %fp-to-i-if-end9 ; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[6:7] @@ -592,57 +629,61 @@ define i128 @fptoui_f64_to_i128(double %x) { ; GISEL-NEXT: v_or_b32_e32 v1, v1, v19 ; GISEL-NEXT: v_or_b32_e32 v0, v0, v20 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v20 -; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GISEL-NEXT: v_lshl_or_b32 v10, v0, 16, v0 -; GISEL-NEXT: v_or3_b32 v8, v1, v2, 1 -; GISEL-NEXT: v_or3_b32 v9, v0, v2, 0 +; GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GISEL-NEXT: v_or3_b32 v8, v1, v3, 1 ; GISEL-NEXT: v_mov_b32_e32 v0, 0x433 ; GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GISEL-NEXT: v_and_b32_e32 v2, 0xfffff, v5 ; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1] -; GISEL-NEXT: v_or_b32_e32 v5, 0x100000, v2 +; GISEL-NEXT: v_and_b32_e32 v0, 0xfffff, v5 +; GISEL-NEXT: s_xor_b64 s[16:17], vcc, exec +; GISEL-NEXT: v_lshl_or_b32 v9, v2, 16, v2 +; GISEL-NEXT: v_or3_b32 v10, v2, v3, 0 +; GISEL-NEXT: s_and_b64 s[6:7], vcc, -1 +; GISEL-NEXT: v_or_b32_e32 v5, 0x100000, v0 ; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] -; GISEL-NEXT: s_cbranch_execz .LBB1_4 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB1_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else ; GISEL-NEXT: v_add_u32_e32 v6, 0xfffffbcd, v6 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v6, v[4:5] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v10, 0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v9, 0 ; GISEL-NEXT: v_subrev_u32_e32 v7, 64, v6 ; GISEL-NEXT: v_sub_u32_e32 v2, 64, v6 ; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] ; GISEL-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v6 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v10, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v13, v2, 0, s[6:7] ; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[6:7] ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v11, v8, 0 ; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v11, v10 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v11, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v12, v10 +; GISEL-NEXT: v_mul_lo_u32 v6, v11, v9 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v11, v10, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v12, v9 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v8, v[1:2] ; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] ; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v10, v[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] -; GISEL-NEXT: ; implicit-def: $vgpr10 ; GISEL-NEXT: ; implicit-def: $vgpr9 +; GISEL-NEXT: ; implicit-def: $vgpr10 ; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr8 +; GISEL-NEXT: s_or_b64 exec, exec, s[16:17] ; GISEL-NEXT: .LBB1_4: ; %Flow -; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[16:17] -; GISEL-NEXT: s_cbranch_execz .LBB1_6 +; GISEL-NEXT: s_xor_b64 s[8:9], s[16:17], exec +; GISEL-NEXT: s_and_b64 s[6:7], s[16:17], -1 +; GISEL-NEXT: s_cmov_b64 exec, s[16:17] +; GISEL-NEXT: s_cbranch_scc0 .LBB1_6 ; GISEL-NEXT: ; %bb.5: ; %fp-to-i-if-then12 ; GISEL-NEXT: v_sub_co_u32_e32 v6, vcc, 0x433, v6 ; GISEL-NEXT: v_subrev_u32_e32 v2, 64, v6 @@ -653,21 +694,24 @@ define i128 @fptoui_f64_to_i128(double %x) { ; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v4, v10, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v4, v9, 0 ; GISEL-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v4, v8, 0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v5, v9, v[2:3] -; GISEL-NEXT: v_mul_lo_u32 v6, v5, v10 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v4, v10 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v5, v10, v[2:3] +; GISEL-NEXT: v_mul_lo_u32 v6, v5, v9 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v10, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v4, v9 ; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v5, v8, v[1:2] ; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v3, v4, s[6:7] ; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc -; GISEL-NEXT: .LBB1_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: .LBB1_6: ; %Flow1 +; GISEL-NEXT: s_or_b64 exec, exec, s[14:15] ; GISEL-NEXT: .LBB1_7: ; %Flow2 -; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[14:15] -; GISEL-NEXT: s_cbranch_execz .LBB1_9 +; GISEL-NEXT: s_xor_b64 s[6:7], s[14:15], exec +; GISEL-NEXT: s_and_b64 s[8:9], s[14:15], -1 +; GISEL-NEXT: s_cmov_b64 exec, s[14:15] +; GISEL-NEXT: s_cbranch_scc0 .LBB1_9 ; GISEL-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] ; GISEL-NEXT: v_and_b32_e32 v1, 1, v1 @@ -737,10 +781,10 @@ define i128 @fptoui_f64_to_i128(double %x) { ; GISEL-NEXT: v_or3_b32 v1, v2, v13, v1 ; GISEL-NEXT: v_add_u32_e32 v3, 0x80000000, v1 ; GISEL-NEXT: v_mov_b32_e32 v2, v1 -; GISEL-NEXT: .LBB1_9: ; %Flow3 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] -; GISEL-NEXT: .LBB1_10: ; %fp-to-i-cleanup +; GISEL-NEXT: .LBB1_9: ; %Flow3 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: .LBB1_10: ; %fp-to-i-cleanup ; GISEL-NEXT: s_setpc_b64 s[30:31] %cvt = fptoui double %x to i128 ret i128 %cvt @@ -753,14 +797,16 @@ define i128 @fptosi_f32_to_i128(float %x) { ; SDAG-NEXT: v_mov_b32_e32 v4, v0 ; SDAG-NEXT: v_bfe_u32 v5, v4, 23, 8 ; SDAG-NEXT: s_movk_i32 s4, 0x7e +; SDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v5 ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v6, 0 +; SDAG-NEXT: s_mov_b64 s[8:9], exec ; SDAG-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-NEXT: v_mov_b32_e32 v3, 0 -; SDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v5 -; SDAG-NEXT: s_and_saveexec_b64 s[8:9], vcc -; SDAG-NEXT: s_cbranch_execz .LBB2_10 +; SDAG-NEXT: s_and_b64 s[4:5], vcc, -1 +; SDAG-NEXT: v_mov_b32_e32 v6, 0 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB2_10 ; SDAG-NEXT: ; %bb.1: ; %fp-to-i-if-end ; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5 ; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v6, vcc @@ -768,31 +814,34 @@ define i128 @fptosi_f32_to_i128(float %x) { ; SDAG-NEXT: s_movk_i32 s4, 0xff7f ; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v6, vcc ; SDAG-NEXT: s_mov_b32 s5, -1 -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[0:1] -; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], -1, v[2:3] -; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, -1, v4 -; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] +; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] +; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], -1, v[2:3] ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[6:7] -; SDAG-NEXT: s_cbranch_execz .LBB2_7 +; SDAG-NEXT: s_and_b64 s[4:5], s[4:5], vcc +; SDAG-NEXT: s_and_b64 s[4:5], s[4:5], exec +; SDAG-NEXT: s_xor_b64 s[10:11], s[4:5], exec +; SDAG-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, -1, v4 +; SDAG-NEXT: s_cmov_b64 exec, s[4:5] +; SDAG-NEXT: s_cbranch_scc0 .LBB2_7 ; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9 ; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SDAG-NEXT: v_add_co_u32_e64 v9, s[4:5], -1, v0 ; SDAG-NEXT: v_addc_co_u32_e64 v11, s[4:5], 0, -1, s[4:5] ; SDAG-NEXT: s_mov_b64 s[4:5], 0x95 -; SDAG-NEXT: v_and_b32_e32 v0, 0x7fffff, v4 ; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[5:6] +; SDAG-NEXT: v_and_b32_e32 v0, 0x7fffff, v4 +; SDAG-NEXT: s_xor_b64 s[12:13], s[4:5], exec ; SDAG-NEXT: v_mov_b32_e32 v7, 0 ; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v10, -1, 1, vcc +; SDAG-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; SDAG-NEXT: v_or_b32_e32 v6, 0x800000, v0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[6:7] -; SDAG-NEXT: s_cbranch_execz .LBB2_4 +; SDAG-NEXT: s_cmov_b64 exec, s[4:5] +; SDAG-NEXT: s_cbranch_scc0 .LBB2_4 ; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-else ; SDAG-NEXT: v_sub_u32_e32 v0, 0xd6, v5 ; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff2a, v5 @@ -830,12 +879,15 @@ define i128 @fptosi_f32_to_i128(float %x) { ; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3 ; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 ; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] -; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 ; SDAG-NEXT: v_mov_b32_e32 v1, v4 +; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 ; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 +; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] ; SDAG-NEXT: .LBB2_4: ; %Flow -; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[12:13] -; SDAG-NEXT: s_cbranch_execz .LBB2_6 +; SDAG-NEXT: s_xor_b64 s[6:7], s[12:13], exec +; SDAG-NEXT: s_and_b64 s[4:5], s[12:13], -1 +; SDAG-NEXT: s_cmov_b64 exec, s[12:13] +; SDAG-NEXT: s_cbranch_scc0 .LBB2_6 ; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12 ; SDAG-NEXT: v_sub_u32_e32 v2, 0x96, v5 ; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[6:7] @@ -849,10 +901,14 @@ define i128 @fptosi_f32_to_i128(float %x) { ; SDAG-NEXT: v_mov_b32_e32 v1, v5 ; SDAG-NEXT: v_mad_i64_i32 v[2:3], s[4:5], v9, v3, v[1:2] ; SDAG-NEXT: v_mov_b32_e32 v1, v4 -; SDAG-NEXT: .LBB2_6: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: .LBB2_6: ; %Flow1 +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB2_7: ; %Flow2 -; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: s_xor_b64 s[4:5], s[10:11], exec +; SDAG-NEXT: s_and_b64 s[6:7], s[10:11], -1 +; SDAG-NEXT: s_cmov_b64 exec, s[10:11] +; SDAG-NEXT: s_cbranch_scc0 .LBB2_9 ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 @@ -860,10 +916,10 @@ define i128 @fptosi_f32_to_i128(float %x) { ; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; SDAG-NEXT: v_mov_b32_e32 v0, v2 ; SDAG-NEXT: v_mov_b32_e32 v1, v2 -; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] -; SDAG-NEXT: .LBB2_10: ; %fp-to-i-cleanup +; SDAG-NEXT: .LBB2_9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: .LBB2_10: ; %fp-to-i-cleanup ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: fptosi_f32_to_i128: @@ -872,39 +928,42 @@ define i128 @fptosi_f32_to_i128(float %x) { ; GISEL-NEXT: v_mov_b32_e32 v4, v0 ; GISEL-NEXT: v_mov_b32_e32 v5, 0 ; GISEL-NEXT: v_lshrrev_b64 v[0:1], 23, v[4:5] -; GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GISEL-NEXT: v_mov_b32_e32 v7, v5 ; GISEL-NEXT: v_bfe_u32 v6, v0, 0, 8 ; GISEL-NEXT: v_mov_b32_e32 v0, 0x7f ; GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GISEL-NEXT: v_mov_b32_e32 v7, v5 +; GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1] ; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] ; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: s_mov_b64 s[12:13], exec +; GISEL-NEXT: s_and_b64 s[8:9], vcc, -1 ; GISEL-NEXT: v_mov_b32_e32 v1, s5 ; GISEL-NEXT: v_mov_b32_e32 v2, s6 ; GISEL-NEXT: v_mov_b32_e32 v3, s7 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc -; GISEL-NEXT: s_cbranch_execz .LBB2_10 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB2_10 ; GISEL-NEXT: ; %bb.1: ; %fp-to-i-if-end ; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v6 ; GISEL-NEXT: v_mov_b32_e32 v2, 0xffffff80 -; GISEL-NEXT: v_addc_co_u32_e64 v1, s[6:7], 0, -1, vcc +; GISEL-NEXT: v_addc_co_u32_e64 v1, s[4:5], 0, -1, vcc ; GISEL-NEXT: v_mov_b32_e32 v3, -1 -; GISEL-NEXT: v_addc_co_u32_e64 v8, s[6:7], 0, -1, s[6:7] +; GISEL-NEXT: v_addc_co_u32_e64 v8, s[4:5], 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[2:3] -; GISEL-NEXT: v_addc_co_u32_e64 v9, s[6:7], 0, -1, s[6:7] +; GISEL-NEXT: v_addc_co_u32_e64 v9, s[4:5], 0, -1, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_cmp_le_u64_e32 vcc, -1, v[8:9] -; GISEL-NEXT: v_cmp_lt_i32_e64 s[4:5], -1, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, -1, v[8:9] ; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GISEL-NEXT: s_xor_b64 s[14:15], vcc, exec +; GISEL-NEXT: s_and_b64 s[4:5], vcc, -1 +; GISEL-NEXT: v_cmp_lt_i32_e64 s[4:5], -1, v4 ; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GISEL-NEXT: s_xor_b64 s[14:15], exec, s[6:7] -; GISEL-NEXT: s_cbranch_execz .LBB2_7 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB2_7 ; GISEL-NEXT: ; %bb.2: ; %fp-to-i-if-end9 ; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[6:7] @@ -963,14 +1022,15 @@ define i128 @fptosi_f32_to_i128(float %x) { ; GISEL-NEXT: v_or3_b32 v8, v0, v2, 0 ; GISEL-NEXT: v_mov_b32_e32 v0, 0x96 ; GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GISEL-NEXT: v_and_b32_e32 v2, 0x7fffff, v4 ; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1] +; GISEL-NEXT: v_and_b32_e32 v2, 0x7fffff, v4 +; GISEL-NEXT: s_xor_b64 s[16:17], vcc, exec ; GISEL-NEXT: v_or_b32_e32 v4, 0x800000, v2 +; GISEL-NEXT: s_and_b64 s[6:7], vcc, -1 ; GISEL-NEXT: v_mov_b32_e32 v5, 0 ; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] -; GISEL-NEXT: s_cbranch_execz .LBB2_4 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB2_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else ; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff6a, v6 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v6, v[4:5] @@ -1004,9 +1064,12 @@ define i128 @fptosi_f32_to_i128(float %x) { ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr9 +; GISEL-NEXT: s_or_b64 exec, exec, s[16:17] ; GISEL-NEXT: .LBB2_4: ; %Flow -; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[16:17] -; GISEL-NEXT: s_cbranch_execz .LBB2_6 +; GISEL-NEXT: s_xor_b64 s[6:7], s[16:17], exec +; GISEL-NEXT: s_and_b64 s[8:9], s[16:17], -1 +; GISEL-NEXT: s_cmov_b64 exec, s[16:17] +; GISEL-NEXT: s_cbranch_scc0 .LBB2_6 ; GISEL-NEXT: ; %bb.5: ; %fp-to-i-if-then12 ; GISEL-NEXT: v_sub_co_u32_e32 v3, vcc, 0x96, v6 ; GISEL-NEXT: v_subrev_u32_e32 v2, 64, v3 @@ -1021,11 +1084,14 @@ define i128 @fptosi_f32_to_i128(float %x) { ; GISEL-NEXT: v_mul_lo_u32 v5, v4, v10 ; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v8, v[1:2] ; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc -; GISEL-NEXT: .LBB2_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-NEXT: .LBB2_6: ; %Flow1 +; GISEL-NEXT: s_or_b64 exec, exec, s[14:15] ; GISEL-NEXT: .LBB2_7: ; %Flow2 -; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[14:15] -; GISEL-NEXT: s_cbranch_execz .LBB2_9 +; GISEL-NEXT: s_xor_b64 s[6:7], s[14:15], exec +; GISEL-NEXT: s_and_b64 s[8:9], s[14:15], -1 +; GISEL-NEXT: s_cmov_b64 exec, s[14:15] +; GISEL-NEXT: s_cbranch_scc0 .LBB2_9 ; GISEL-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] ; GISEL-NEXT: v_and_b32_e32 v1, 1, v1 @@ -1095,10 +1161,10 @@ define i128 @fptosi_f32_to_i128(float %x) { ; GISEL-NEXT: v_or3_b32 v1, v2, v13, v1 ; GISEL-NEXT: v_add_u32_e32 v3, 0x80000000, v1 ; GISEL-NEXT: v_mov_b32_e32 v2, v1 -; GISEL-NEXT: .LBB2_9: ; %Flow3 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] -; GISEL-NEXT: .LBB2_10: ; %fp-to-i-cleanup +; GISEL-NEXT: .LBB2_9: ; %Flow3 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: .LBB2_10: ; %fp-to-i-cleanup ; GISEL-NEXT: s_setpc_b64 s[30:31] %cvt = fptosi float %x to i128 ret i128 %cvt @@ -1111,14 +1177,16 @@ define i128 @fptoui_f32_to_i128(float %x) { ; SDAG-NEXT: v_mov_b32_e32 v4, v0 ; SDAG-NEXT: v_bfe_u32 v5, v4, 23, 8 ; SDAG-NEXT: s_movk_i32 s4, 0x7e +; SDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v5 ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v6, 0 +; SDAG-NEXT: s_mov_b64 s[8:9], exec ; SDAG-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-NEXT: v_mov_b32_e32 v3, 0 -; SDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v5 -; SDAG-NEXT: s_and_saveexec_b64 s[8:9], vcc -; SDAG-NEXT: s_cbranch_execz .LBB3_10 +; SDAG-NEXT: s_and_b64 s[4:5], vcc, -1 +; SDAG-NEXT: v_mov_b32_e32 v6, 0 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB3_10 ; SDAG-NEXT: ; %bb.1: ; %fp-to-i-if-end ; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5 ; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v6, vcc @@ -1126,31 +1194,34 @@ define i128 @fptoui_f32_to_i128(float %x) { ; SDAG-NEXT: s_movk_i32 s4, 0xff7f ; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v6, vcc ; SDAG-NEXT: s_mov_b32 s5, -1 -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[0:1] -; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], -1, v[2:3] -; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, -1, v4 -; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] +; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] +; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], -1, v[2:3] ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[6:7] -; SDAG-NEXT: s_cbranch_execz .LBB3_7 +; SDAG-NEXT: s_and_b64 s[4:5], s[4:5], vcc +; SDAG-NEXT: s_and_b64 s[4:5], s[4:5], exec +; SDAG-NEXT: s_xor_b64 s[10:11], s[4:5], exec +; SDAG-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, -1, v4 +; SDAG-NEXT: s_cmov_b64 exec, s[4:5] +; SDAG-NEXT: s_cbranch_scc0 .LBB3_7 ; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9 ; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SDAG-NEXT: v_add_co_u32_e64 v9, s[4:5], -1, v0 ; SDAG-NEXT: v_addc_co_u32_e64 v11, s[4:5], 0, -1, s[4:5] ; SDAG-NEXT: s_mov_b64 s[4:5], 0x95 -; SDAG-NEXT: v_and_b32_e32 v0, 0x7fffff, v4 ; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[5:6] +; SDAG-NEXT: v_and_b32_e32 v0, 0x7fffff, v4 +; SDAG-NEXT: s_xor_b64 s[12:13], s[4:5], exec ; SDAG-NEXT: v_mov_b32_e32 v7, 0 ; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v10, -1, 1, vcc +; SDAG-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; SDAG-NEXT: v_or_b32_e32 v6, 0x800000, v0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[6:7] -; SDAG-NEXT: s_cbranch_execz .LBB3_4 +; SDAG-NEXT: s_cmov_b64 exec, s[4:5] +; SDAG-NEXT: s_cbranch_scc0 .LBB3_4 ; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-else ; SDAG-NEXT: v_sub_u32_e32 v0, 0xd6, v5 ; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff2a, v5 @@ -1188,12 +1259,15 @@ define i128 @fptoui_f32_to_i128(float %x) { ; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3 ; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 ; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] -; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 ; SDAG-NEXT: v_mov_b32_e32 v1, v4 +; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 ; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 +; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] ; SDAG-NEXT: .LBB3_4: ; %Flow -; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[12:13] -; SDAG-NEXT: s_cbranch_execz .LBB3_6 +; SDAG-NEXT: s_xor_b64 s[6:7], s[12:13], exec +; SDAG-NEXT: s_and_b64 s[4:5], s[12:13], -1 +; SDAG-NEXT: s_cmov_b64 exec, s[12:13] +; SDAG-NEXT: s_cbranch_scc0 .LBB3_6 ; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12 ; SDAG-NEXT: v_sub_u32_e32 v2, 0x96, v5 ; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[6:7] @@ -1207,10 +1281,14 @@ define i128 @fptoui_f32_to_i128(float %x) { ; SDAG-NEXT: v_mov_b32_e32 v1, v5 ; SDAG-NEXT: v_mad_i64_i32 v[2:3], s[4:5], v9, v3, v[1:2] ; SDAG-NEXT: v_mov_b32_e32 v1, v4 -; SDAG-NEXT: .LBB3_6: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: .LBB3_6: ; %Flow1 +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB3_7: ; %Flow2 -; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: s_xor_b64 s[4:5], s[10:11], exec +; SDAG-NEXT: s_and_b64 s[6:7], s[10:11], -1 +; SDAG-NEXT: s_cmov_b64 exec, s[10:11] +; SDAG-NEXT: s_cbranch_scc0 .LBB3_9 ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 @@ -1218,10 +1296,10 @@ define i128 @fptoui_f32_to_i128(float %x) { ; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; SDAG-NEXT: v_mov_b32_e32 v0, v2 ; SDAG-NEXT: v_mov_b32_e32 v1, v2 -; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] -; SDAG-NEXT: .LBB3_10: ; %fp-to-i-cleanup +; SDAG-NEXT: .LBB3_9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: .LBB3_10: ; %fp-to-i-cleanup ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: fptoui_f32_to_i128: @@ -1230,39 +1308,42 @@ define i128 @fptoui_f32_to_i128(float %x) { ; GISEL-NEXT: v_mov_b32_e32 v4, v0 ; GISEL-NEXT: v_mov_b32_e32 v5, 0 ; GISEL-NEXT: v_lshrrev_b64 v[0:1], 23, v[4:5] -; GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GISEL-NEXT: v_mov_b32_e32 v7, v5 ; GISEL-NEXT: v_bfe_u32 v6, v0, 0, 8 ; GISEL-NEXT: v_mov_b32_e32 v0, 0x7f ; GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GISEL-NEXT: v_mov_b32_e32 v7, v5 +; GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1] ; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] ; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: s_mov_b64 s[12:13], exec +; GISEL-NEXT: s_and_b64 s[8:9], vcc, -1 ; GISEL-NEXT: v_mov_b32_e32 v1, s5 ; GISEL-NEXT: v_mov_b32_e32 v2, s6 ; GISEL-NEXT: v_mov_b32_e32 v3, s7 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc -; GISEL-NEXT: s_cbranch_execz .LBB3_10 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB3_10 ; GISEL-NEXT: ; %bb.1: ; %fp-to-i-if-end ; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v6 ; GISEL-NEXT: v_mov_b32_e32 v2, 0xffffff80 -; GISEL-NEXT: v_addc_co_u32_e64 v1, s[6:7], 0, -1, vcc +; GISEL-NEXT: v_addc_co_u32_e64 v1, s[4:5], 0, -1, vcc ; GISEL-NEXT: v_mov_b32_e32 v3, -1 -; GISEL-NEXT: v_addc_co_u32_e64 v8, s[6:7], 0, -1, s[6:7] +; GISEL-NEXT: v_addc_co_u32_e64 v8, s[4:5], 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[2:3] -; GISEL-NEXT: v_addc_co_u32_e64 v9, s[6:7], 0, -1, s[6:7] +; GISEL-NEXT: v_addc_co_u32_e64 v9, s[4:5], 0, -1, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_cmp_le_u64_e32 vcc, -1, v[8:9] -; GISEL-NEXT: v_cmp_lt_i32_e64 s[4:5], -1, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, -1, v[8:9] ; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GISEL-NEXT: s_xor_b64 s[14:15], vcc, exec +; GISEL-NEXT: s_and_b64 s[4:5], vcc, -1 +; GISEL-NEXT: v_cmp_lt_i32_e64 s[4:5], -1, v4 ; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GISEL-NEXT: s_xor_b64 s[14:15], exec, s[6:7] -; GISEL-NEXT: s_cbranch_execz .LBB3_7 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB3_7 ; GISEL-NEXT: ; %bb.2: ; %fp-to-i-if-end9 ; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[6:7] @@ -1321,14 +1402,15 @@ define i128 @fptoui_f32_to_i128(float %x) { ; GISEL-NEXT: v_or3_b32 v8, v0, v2, 0 ; GISEL-NEXT: v_mov_b32_e32 v0, 0x96 ; GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GISEL-NEXT: v_and_b32_e32 v2, 0x7fffff, v4 ; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1] +; GISEL-NEXT: v_and_b32_e32 v2, 0x7fffff, v4 +; GISEL-NEXT: s_xor_b64 s[16:17], vcc, exec ; GISEL-NEXT: v_or_b32_e32 v4, 0x800000, v2 +; GISEL-NEXT: s_and_b64 s[6:7], vcc, -1 ; GISEL-NEXT: v_mov_b32_e32 v5, 0 ; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] -; GISEL-NEXT: s_cbranch_execz .LBB3_4 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB3_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else ; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff6a, v6 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v6, v[4:5] @@ -1362,9 +1444,12 @@ define i128 @fptoui_f32_to_i128(float %x) { ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr9 +; GISEL-NEXT: s_or_b64 exec, exec, s[16:17] ; GISEL-NEXT: .LBB3_4: ; %Flow -; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[16:17] -; GISEL-NEXT: s_cbranch_execz .LBB3_6 +; GISEL-NEXT: s_xor_b64 s[6:7], s[16:17], exec +; GISEL-NEXT: s_and_b64 s[8:9], s[16:17], -1 +; GISEL-NEXT: s_cmov_b64 exec, s[16:17] +; GISEL-NEXT: s_cbranch_scc0 .LBB3_6 ; GISEL-NEXT: ; %bb.5: ; %fp-to-i-if-then12 ; GISEL-NEXT: v_sub_co_u32_e32 v3, vcc, 0x96, v6 ; GISEL-NEXT: v_subrev_u32_e32 v2, 64, v3 @@ -1379,11 +1464,14 @@ define i128 @fptoui_f32_to_i128(float %x) { ; GISEL-NEXT: v_mul_lo_u32 v5, v4, v10 ; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v8, v[1:2] ; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc -; GISEL-NEXT: .LBB3_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-NEXT: .LBB3_6: ; %Flow1 +; GISEL-NEXT: s_or_b64 exec, exec, s[14:15] ; GISEL-NEXT: .LBB3_7: ; %Flow2 -; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[14:15] -; GISEL-NEXT: s_cbranch_execz .LBB3_9 +; GISEL-NEXT: s_xor_b64 s[6:7], s[14:15], exec +; GISEL-NEXT: s_and_b64 s[8:9], s[14:15], -1 +; GISEL-NEXT: s_cmov_b64 exec, s[14:15] +; GISEL-NEXT: s_cbranch_scc0 .LBB3_9 ; GISEL-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] ; GISEL-NEXT: v_and_b32_e32 v1, 1, v1 @@ -1453,10 +1541,10 @@ define i128 @fptoui_f32_to_i128(float %x) { ; GISEL-NEXT: v_or3_b32 v1, v2, v13, v1 ; GISEL-NEXT: v_add_u32_e32 v3, 0x80000000, v1 ; GISEL-NEXT: v_mov_b32_e32 v2, v1 -; GISEL-NEXT: .LBB3_9: ; %Flow3 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] -; GISEL-NEXT: .LBB3_10: ; %fp-to-i-cleanup +; GISEL-NEXT: .LBB3_9: ; %Flow3 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: .LBB3_10: ; %fp-to-i-cleanup ; GISEL-NEXT: s_setpc_b64 s[30:31] %cvt = fptoui float %x to i128 ret i128 %cvt @@ -1497,14 +1585,16 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: v_mov_b32_e32 v4, v0 ; SDAG-NEXT: v_bfe_u32 v5, v4, 7, 8 ; SDAG-NEXT: s_movk_i32 s4, 0x7e +; SDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v5 ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v6, 0 +; SDAG-NEXT: s_mov_b64 s[8:9], exec ; SDAG-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-NEXT: v_mov_b32_e32 v3, 0 -; SDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v5 -; SDAG-NEXT: s_and_saveexec_b64 s[8:9], vcc -; SDAG-NEXT: s_cbranch_execz .LBB6_10 +; SDAG-NEXT: s_and_b64 s[4:5], vcc, -1 +; SDAG-NEXT: v_mov_b32_e32 v6, 0 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB6_10 ; SDAG-NEXT: ; %bb.1: ; %fp-to-i-if-end ; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5 ; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v6, vcc @@ -1512,29 +1602,32 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: s_movk_i32 s4, 0xff7f ; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v6, vcc ; SDAG-NEXT: s_mov_b32 s5, -1 -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[0:1] -; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], -1, v[2:3] -; SDAG-NEXT: v_cmp_lt_i16_e32 vcc, -1, v4 -; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] +; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] +; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], -1, v[2:3] ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[6:7] -; SDAG-NEXT: s_cbranch_execz .LBB6_7 +; SDAG-NEXT: s_and_b64 s[4:5], s[4:5], vcc +; SDAG-NEXT: s_and_b64 s[4:5], s[4:5], exec +; SDAG-NEXT: s_xor_b64 s[10:11], s[4:5], exec +; SDAG-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; SDAG-NEXT: v_cmp_lt_i16_e32 vcc, -1, v4 +; SDAG-NEXT: s_cmov_b64 exec, s[4:5] +; SDAG-NEXT: s_cbranch_scc0 .LBB6_7 ; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9 -; SDAG-NEXT: s_movk_i32 s4, 0x7f -; SDAG-NEXT: v_and_b32_sdwa v0, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; SDAG-NEXT: s_mov_b64 s[4:5], 0x85 ; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[5:6] +; SDAG-NEXT: s_movk_i32 s6, 0x7f +; SDAG-NEXT: v_and_b32_sdwa v0, v4, s6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; SDAG-NEXT: s_xor_b64 s[12:13], s[4:5], exec ; SDAG-NEXT: v_mov_b32_e32 v7, 0 ; SDAG-NEXT: v_cndmask_b32_e64 v9, -1, 0, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 1, vcc +; SDAG-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; SDAG-NEXT: v_or_b32_e32 v6, 0x80, v0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[6:7] -; SDAG-NEXT: s_cbranch_execz .LBB6_4 +; SDAG-NEXT: s_cmov_b64 exec, s[4:5] +; SDAG-NEXT: s_cbranch_scc0 .LBB6_4 ; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-else ; SDAG-NEXT: v_sub_u32_e32 v0, 0xc6, v5 ; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff3a, v5 @@ -1573,11 +1666,15 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3 ; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 ; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] -; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 ; SDAG-NEXT: v_mov_b32_e32 v1, v4 +; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 ; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 +; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] ; SDAG-NEXT: .LBB6_4: ; %Flow -; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[12:13] +; SDAG-NEXT: s_xor_b64 s[6:7], s[12:13], exec +; SDAG-NEXT: s_and_b64 s[4:5], s[12:13], -1 +; SDAG-NEXT: s_cmov_b64 exec, s[12:13] +; SDAG-NEXT: s_cbranch_scc0 .LBB6_6 ; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12 ; SDAG-NEXT: v_sub_u32_e32 v2, 0x86, v5 ; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[6:7] @@ -1589,10 +1686,14 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; SDAG-NEXT: v_mul_i32_i24_e32 v0, v0, v8 ; SDAG-NEXT: v_mov_b32_e32 v3, v2 -; SDAG-NEXT: ; %bb.6: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: .LBB6_6: ; %Flow1 +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB6_7: ; %Flow2 -; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: s_xor_b64 s[4:5], s[10:11], exec +; SDAG-NEXT: s_and_b64 s[6:7], s[10:11], -1 +; SDAG-NEXT: s_cmov_b64 exec, s[10:11] +; SDAG-NEXT: s_cbranch_scc0 .LBB6_9 ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 @@ -1600,10 +1701,10 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; SDAG-NEXT: v_mov_b32_e32 v0, v2 ; SDAG-NEXT: v_mov_b32_e32 v1, v2 -; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] -; SDAG-NEXT: .LBB6_10: ; %fp-to-i-cleanup +; SDAG-NEXT: .LBB6_9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: .LBB6_10: ; %fp-to-i-cleanup ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: fptosi_bf16_to_i128: @@ -1614,37 +1715,40 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: v_mov_b32_e32 v6, 0 ; GISEL-NEXT: v_lshrrev_b64 v[0:1], 7, v[5:6] ; GISEL-NEXT: v_mov_b32_e32 v1, 0x7f -; GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GISEL-NEXT: v_bfe_u32 v5, v0, 0, 8 +; GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[5:6], v[1:2] ; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] ; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: s_mov_b64 s[12:13], exec +; GISEL-NEXT: s_and_b64 s[8:9], vcc, -1 ; GISEL-NEXT: v_mov_b32_e32 v1, s5 ; GISEL-NEXT: v_mov_b32_e32 v2, s6 ; GISEL-NEXT: v_mov_b32_e32 v3, s7 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc -; GISEL-NEXT: s_cbranch_execz .LBB6_10 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB6_10 ; GISEL-NEXT: ; %bb.1: ; %fp-to-i-if-end ; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5 ; GISEL-NEXT: v_mov_b32_e32 v2, 0xffffff80 -; GISEL-NEXT: v_addc_co_u32_e64 v1, s[6:7], 0, -1, vcc +; GISEL-NEXT: v_addc_co_u32_e64 v1, s[4:5], 0, -1, vcc ; GISEL-NEXT: v_mov_b32_e32 v3, -1 -; GISEL-NEXT: v_addc_co_u32_e64 v7, s[6:7], 0, -1, s[6:7] +; GISEL-NEXT: v_addc_co_u32_e64 v7, s[4:5], 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[2:3] -; GISEL-NEXT: v_addc_co_u32_e64 v8, s[6:7], 0, -1, s[6:7] +; GISEL-NEXT: v_addc_co_u32_e64 v8, s[4:5], 0, -1, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_cmp_le_u64_e32 vcc, -1, v[7:8] -; GISEL-NEXT: v_cmp_lt_i16_e64 s[4:5], -1, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, -1, v[7:8] ; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GISEL-NEXT: s_xor_b64 s[14:15], vcc, exec +; GISEL-NEXT: s_and_b64 s[4:5], vcc, -1 +; GISEL-NEXT: v_cmp_lt_i16_e64 s[4:5], -1, v4 ; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GISEL-NEXT: s_xor_b64 s[14:15], exec, s[6:7] -; GISEL-NEXT: s_cbranch_execz .LBB6_7 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB6_7 ; GISEL-NEXT: ; %bb.2: ; %fp-to-i-if-end9 ; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[6:7] @@ -1695,74 +1799,81 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: v_or_b32_e32 v1, v1, v18 ; GISEL-NEXT: v_or_b32_e32 v0, v0, v19 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v19 -; GISEL-NEXT: v_and_b32_e32 v11, 0xffff, v0 +; GISEL-NEXT: v_and_b32_e32 v10, 0xffff, v0 ; GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v11 -; GISEL-NEXT: v_or3_b32 v9, v1, v0, 1 -; GISEL-NEXT: v_or3_b32 v10, v11, v0, 0 +; GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v10 +; GISEL-NEXT: v_or3_b32 v8, v1, v0, 1 +; GISEL-NEXT: v_or3_b32 v9, v10, v0, 0 ; GISEL-NEXT: v_mov_b32_e32 v0, 0x86 ; GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GISEL-NEXT: v_and_b32_e32 v2, 0x7f, v4 ; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[5:6], v[0:1] -; GISEL-NEXT: v_or_b32_e32 v7, 0x80, v2 -; GISEL-NEXT: v_mov_b32_e32 v8, 0 +; GISEL-NEXT: v_and_b32_e32 v2, 0x7f, v4 +; GISEL-NEXT: s_xor_b64 s[16:17], vcc, exec +; GISEL-NEXT: v_or_b32_e32 v6, 0x80, v2 +; GISEL-NEXT: s_and_b64 s[6:7], vcc, -1 +; GISEL-NEXT: v_mov_b32_e32 v7, 0 ; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] -; GISEL-NEXT: s_cbranch_execz .LBB6_4 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB6_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else -; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff7a, v5 -; GISEL-NEXT: v_lshlrev_b64 v[0:1], v6, v[7:8] -; GISEL-NEXT: v_subrev_u32_e32 v4, 64, v6 -; GISEL-NEXT: v_sub_u32_e32 v2, 64, v6 -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 -; GISEL-NEXT: v_lshl_or_b32 v11, v11, 16, v11 -; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[7:8] -; GISEL-NEXT: v_lshlrev_b64 v[4:5], v4, v[7:8] -; GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v8, v11, 0 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v6 +; GISEL-NEXT: v_add_u32_e32 v11, 0xffffff7a, v5 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v11, v[6:7] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v11 +; GISEL-NEXT: v_lshl_or_b32 v10, v10, 16, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v10, 0 +; GISEL-NEXT: v_subrev_u32_e32 v4, 64, v11 +; GISEL-NEXT: v_sub_u32_e32 v2, 64, v11 +; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[6:7] +; GISEL-NEXT: v_lshlrev_b64 v[4:5], v4, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v10, v[0:1] -; GISEL-NEXT: v_cndmask_b32_e64 v13, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v8, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[6:7] -; GISEL-NEXT: v_mul_lo_u32 v4, v12, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, v2, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v11, v8, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v8, 0 ; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v8, v10, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v6, v8, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v6, v12, v10 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v12, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v13, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v8, v[1:2] ; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] ; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v10, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v11, v9, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr5 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v9, v[6:7] -; GISEL-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GISEL-NEXT: ; implicit-def: $vgpr9 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7] +; GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GISEL-NEXT: ; implicit-def: $vgpr8 +; GISEL-NEXT: s_or_b64 exec, exec, s[16:17] ; GISEL-NEXT: .LBB6_4: ; %Flow -; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[16:17] -; GISEL-NEXT: s_cbranch_execz .LBB6_6 +; GISEL-NEXT: s_xor_b64 s[6:7], s[16:17], exec +; GISEL-NEXT: s_and_b64 s[8:9], s[16:17], -1 +; GISEL-NEXT: s_cmov_b64 exec, s[16:17] +; GISEL-NEXT: s_cbranch_scc0 .LBB6_6 ; GISEL-NEXT: ; %bb.5: ; %fp-to-i-if-then12 ; GISEL-NEXT: v_sub_co_u32_e32 v3, vcc, 0x86, v5 ; GISEL-NEXT: v_subrev_u32_e32 v2, 64, v3 -; GISEL-NEXT: v_lshrrev_b64 v[0:1], v3, v[7:8] +; GISEL-NEXT: v_lshrrev_b64 v[0:1], v3, v[6:7] ; GISEL-NEXT: v_lshrrev_b64 v[1:2], v2, 0 ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v3 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc -; GISEL-NEXT: v_mul_hi_i32_i24_e32 v1, v0, v9 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GISEL-NEXT: v_mul_hi_i32_i24_e32 v1, v0, v8 ; GISEL-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; GISEL-NEXT: v_mul_i32_i24_e32 v0, v0, v9 +; GISEL-NEXT: v_mul_i32_i24_e32 v0, v0, v8 ; GISEL-NEXT: v_mov_b32_e32 v3, v2 -; GISEL-NEXT: .LBB6_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-NEXT: .LBB6_6: ; %Flow1 +; GISEL-NEXT: s_or_b64 exec, exec, s[14:15] ; GISEL-NEXT: .LBB6_7: ; %Flow2 -; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[14:15] -; GISEL-NEXT: s_cbranch_execz .LBB6_9 +; GISEL-NEXT: s_xor_b64 s[6:7], s[14:15], exec +; GISEL-NEXT: s_and_b64 s[8:9], s[14:15], -1 +; GISEL-NEXT: s_cmov_b64 exec, s[14:15] +; GISEL-NEXT: s_cbranch_scc0 .LBB6_9 ; GISEL-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] ; GISEL-NEXT: v_and_b32_e32 v1, 1, v1 @@ -1832,10 +1943,10 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: v_or3_b32 v1, v2, v13, v1 ; GISEL-NEXT: v_add_u32_e32 v3, 0x80000000, v1 ; GISEL-NEXT: v_mov_b32_e32 v2, v1 -; GISEL-NEXT: .LBB6_9: ; %Flow3 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] -; GISEL-NEXT: .LBB6_10: ; %fp-to-i-cleanup +; GISEL-NEXT: .LBB6_9: ; %Flow3 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: .LBB6_10: ; %fp-to-i-cleanup ; GISEL-NEXT: s_setpc_b64 s[30:31] %cvt = fptosi bfloat %x to i128 ret i128 %cvt @@ -1848,14 +1959,16 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: v_mov_b32_e32 v4, v0 ; SDAG-NEXT: v_bfe_u32 v5, v4, 7, 8 ; SDAG-NEXT: s_movk_i32 s4, 0x7e +; SDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v5 ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v6, 0 +; SDAG-NEXT: s_mov_b64 s[8:9], exec ; SDAG-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-NEXT: v_mov_b32_e32 v3, 0 -; SDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v5 -; SDAG-NEXT: s_and_saveexec_b64 s[8:9], vcc -; SDAG-NEXT: s_cbranch_execz .LBB7_10 +; SDAG-NEXT: s_and_b64 s[4:5], vcc, -1 +; SDAG-NEXT: v_mov_b32_e32 v6, 0 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB7_10 ; SDAG-NEXT: ; %bb.1: ; %fp-to-i-if-end ; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5 ; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v6, vcc @@ -1863,29 +1976,32 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: s_movk_i32 s4, 0xff7f ; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v6, vcc ; SDAG-NEXT: s_mov_b32 s5, -1 -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[0:1] -; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], -1, v[2:3] -; SDAG-NEXT: v_cmp_lt_i16_e32 vcc, -1, v4 -; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] +; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] +; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], -1, v[2:3] ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[6:7] -; SDAG-NEXT: s_cbranch_execz .LBB7_7 +; SDAG-NEXT: s_and_b64 s[4:5], s[4:5], vcc +; SDAG-NEXT: s_and_b64 s[4:5], s[4:5], exec +; SDAG-NEXT: s_xor_b64 s[10:11], s[4:5], exec +; SDAG-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; SDAG-NEXT: v_cmp_lt_i16_e32 vcc, -1, v4 +; SDAG-NEXT: s_cmov_b64 exec, s[4:5] +; SDAG-NEXT: s_cbranch_scc0 .LBB7_7 ; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9 -; SDAG-NEXT: s_movk_i32 s4, 0x7f -; SDAG-NEXT: v_and_b32_sdwa v0, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; SDAG-NEXT: s_mov_b64 s[4:5], 0x85 ; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[5:6] +; SDAG-NEXT: s_movk_i32 s6, 0x7f +; SDAG-NEXT: v_and_b32_sdwa v0, v4, s6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; SDAG-NEXT: s_xor_b64 s[12:13], s[4:5], exec ; SDAG-NEXT: v_mov_b32_e32 v7, 0 ; SDAG-NEXT: v_cndmask_b32_e64 v9, -1, 0, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 1, vcc +; SDAG-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; SDAG-NEXT: v_or_b32_e32 v6, 0x80, v0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[6:7] -; SDAG-NEXT: s_cbranch_execz .LBB7_4 +; SDAG-NEXT: s_cmov_b64 exec, s[4:5] +; SDAG-NEXT: s_cbranch_scc0 .LBB7_4 ; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-else ; SDAG-NEXT: v_sub_u32_e32 v0, 0xc6, v5 ; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff3a, v5 @@ -1924,11 +2040,15 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3 ; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 ; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] -; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 ; SDAG-NEXT: v_mov_b32_e32 v1, v4 +; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 ; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 +; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] ; SDAG-NEXT: .LBB7_4: ; %Flow -; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[12:13] +; SDAG-NEXT: s_xor_b64 s[6:7], s[12:13], exec +; SDAG-NEXT: s_and_b64 s[4:5], s[12:13], -1 +; SDAG-NEXT: s_cmov_b64 exec, s[12:13] +; SDAG-NEXT: s_cbranch_scc0 .LBB7_6 ; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12 ; SDAG-NEXT: v_sub_u32_e32 v2, 0x86, v5 ; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[6:7] @@ -1940,10 +2060,14 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; SDAG-NEXT: v_mul_i32_i24_e32 v0, v0, v8 ; SDAG-NEXT: v_mov_b32_e32 v3, v2 -; SDAG-NEXT: ; %bb.6: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: .LBB7_6: ; %Flow1 +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB7_7: ; %Flow2 -; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: s_xor_b64 s[4:5], s[10:11], exec +; SDAG-NEXT: s_and_b64 s[6:7], s[10:11], -1 +; SDAG-NEXT: s_cmov_b64 exec, s[10:11] +; SDAG-NEXT: s_cbranch_scc0 .LBB7_9 ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 @@ -1951,10 +2075,10 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; SDAG-NEXT: v_mov_b32_e32 v0, v2 ; SDAG-NEXT: v_mov_b32_e32 v1, v2 -; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] -; SDAG-NEXT: .LBB7_10: ; %fp-to-i-cleanup +; SDAG-NEXT: .LBB7_9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: .LBB7_10: ; %fp-to-i-cleanup ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: fptoui_bf16_to_i128: @@ -1965,37 +2089,40 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: v_mov_b32_e32 v6, 0 ; GISEL-NEXT: v_lshrrev_b64 v[0:1], 7, v[5:6] ; GISEL-NEXT: v_mov_b32_e32 v1, 0x7f -; GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GISEL-NEXT: v_bfe_u32 v5, v0, 0, 8 +; GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[5:6], v[1:2] ; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] ; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: s_mov_b64 s[12:13], exec +; GISEL-NEXT: s_and_b64 s[8:9], vcc, -1 ; GISEL-NEXT: v_mov_b32_e32 v1, s5 ; GISEL-NEXT: v_mov_b32_e32 v2, s6 ; GISEL-NEXT: v_mov_b32_e32 v3, s7 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc -; GISEL-NEXT: s_cbranch_execz .LBB7_10 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB7_10 ; GISEL-NEXT: ; %bb.1: ; %fp-to-i-if-end ; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5 ; GISEL-NEXT: v_mov_b32_e32 v2, 0xffffff80 -; GISEL-NEXT: v_addc_co_u32_e64 v1, s[6:7], 0, -1, vcc +; GISEL-NEXT: v_addc_co_u32_e64 v1, s[4:5], 0, -1, vcc ; GISEL-NEXT: v_mov_b32_e32 v3, -1 -; GISEL-NEXT: v_addc_co_u32_e64 v7, s[6:7], 0, -1, s[6:7] +; GISEL-NEXT: v_addc_co_u32_e64 v7, s[4:5], 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[2:3] -; GISEL-NEXT: v_addc_co_u32_e64 v8, s[6:7], 0, -1, s[6:7] +; GISEL-NEXT: v_addc_co_u32_e64 v8, s[4:5], 0, -1, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_cmp_le_u64_e32 vcc, -1, v[7:8] -; GISEL-NEXT: v_cmp_lt_i16_e64 s[4:5], -1, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, -1, v[7:8] ; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GISEL-NEXT: s_xor_b64 s[14:15], vcc, exec +; GISEL-NEXT: s_and_b64 s[4:5], vcc, -1 +; GISEL-NEXT: v_cmp_lt_i16_e64 s[4:5], -1, v4 ; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GISEL-NEXT: s_xor_b64 s[14:15], exec, s[6:7] -; GISEL-NEXT: s_cbranch_execz .LBB7_7 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB7_7 ; GISEL-NEXT: ; %bb.2: ; %fp-to-i-if-end9 ; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[6:7] @@ -2046,74 +2173,81 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: v_or_b32_e32 v1, v1, v18 ; GISEL-NEXT: v_or_b32_e32 v0, v0, v19 ; GISEL-NEXT: v_or_b32_e32 v1, v1, v19 -; GISEL-NEXT: v_and_b32_e32 v11, 0xffff, v0 +; GISEL-NEXT: v_and_b32_e32 v10, 0xffff, v0 ; GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v11 -; GISEL-NEXT: v_or3_b32 v9, v1, v0, 1 -; GISEL-NEXT: v_or3_b32 v10, v11, v0, 0 +; GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v10 +; GISEL-NEXT: v_or3_b32 v8, v1, v0, 1 +; GISEL-NEXT: v_or3_b32 v9, v10, v0, 0 ; GISEL-NEXT: v_mov_b32_e32 v0, 0x86 ; GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GISEL-NEXT: v_and_b32_e32 v2, 0x7f, v4 ; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[5:6], v[0:1] -; GISEL-NEXT: v_or_b32_e32 v7, 0x80, v2 -; GISEL-NEXT: v_mov_b32_e32 v8, 0 +; GISEL-NEXT: v_and_b32_e32 v2, 0x7f, v4 +; GISEL-NEXT: s_xor_b64 s[16:17], vcc, exec +; GISEL-NEXT: v_or_b32_e32 v6, 0x80, v2 +; GISEL-NEXT: s_and_b64 s[6:7], vcc, -1 +; GISEL-NEXT: v_mov_b32_e32 v7, 0 ; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] -; GISEL-NEXT: s_cbranch_execz .LBB7_4 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB7_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else -; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff7a, v5 -; GISEL-NEXT: v_lshlrev_b64 v[0:1], v6, v[7:8] -; GISEL-NEXT: v_subrev_u32_e32 v4, 64, v6 -; GISEL-NEXT: v_sub_u32_e32 v2, 64, v6 -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 -; GISEL-NEXT: v_lshl_or_b32 v11, v11, 16, v11 -; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[7:8] -; GISEL-NEXT: v_lshlrev_b64 v[4:5], v4, v[7:8] -; GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v8, v11, 0 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v6 +; GISEL-NEXT: v_add_u32_e32 v11, 0xffffff7a, v5 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v11, v[6:7] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v11 +; GISEL-NEXT: v_lshl_or_b32 v10, v10, 16, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v10, 0 +; GISEL-NEXT: v_subrev_u32_e32 v4, 64, v11 +; GISEL-NEXT: v_sub_u32_e32 v2, 64, v11 +; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[6:7] +; GISEL-NEXT: v_lshlrev_b64 v[4:5], v4, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v10, v[0:1] -; GISEL-NEXT: v_cndmask_b32_e64 v13, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v8, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[6:7] -; GISEL-NEXT: v_mul_lo_u32 v4, v12, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, v2, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v11, v8, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v8, 0 ; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v8, v10, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v6, v8, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v6, v12, v10 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v12, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v13, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v8, v[1:2] ; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] ; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v10, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v11, v9, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr5 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v9, v[6:7] -; GISEL-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GISEL-NEXT: ; implicit-def: $vgpr9 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7] +; GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GISEL-NEXT: ; implicit-def: $vgpr8 +; GISEL-NEXT: s_or_b64 exec, exec, s[16:17] ; GISEL-NEXT: .LBB7_4: ; %Flow -; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[16:17] -; GISEL-NEXT: s_cbranch_execz .LBB7_6 +; GISEL-NEXT: s_xor_b64 s[6:7], s[16:17], exec +; GISEL-NEXT: s_and_b64 s[8:9], s[16:17], -1 +; GISEL-NEXT: s_cmov_b64 exec, s[16:17] +; GISEL-NEXT: s_cbranch_scc0 .LBB7_6 ; GISEL-NEXT: ; %bb.5: ; %fp-to-i-if-then12 ; GISEL-NEXT: v_sub_co_u32_e32 v3, vcc, 0x86, v5 ; GISEL-NEXT: v_subrev_u32_e32 v2, 64, v3 -; GISEL-NEXT: v_lshrrev_b64 v[0:1], v3, v[7:8] +; GISEL-NEXT: v_lshrrev_b64 v[0:1], v3, v[6:7] ; GISEL-NEXT: v_lshrrev_b64 v[1:2], v2, 0 ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v3 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc -; GISEL-NEXT: v_mul_hi_i32_i24_e32 v1, v0, v9 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GISEL-NEXT: v_mul_hi_i32_i24_e32 v1, v0, v8 ; GISEL-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; GISEL-NEXT: v_mul_i32_i24_e32 v0, v0, v9 +; GISEL-NEXT: v_mul_i32_i24_e32 v0, v0, v8 ; GISEL-NEXT: v_mov_b32_e32 v3, v2 -; GISEL-NEXT: .LBB7_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-NEXT: .LBB7_6: ; %Flow1 +; GISEL-NEXT: s_or_b64 exec, exec, s[14:15] ; GISEL-NEXT: .LBB7_7: ; %Flow2 -; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[14:15] -; GISEL-NEXT: s_cbranch_execz .LBB7_9 +; GISEL-NEXT: s_xor_b64 s[6:7], s[14:15], exec +; GISEL-NEXT: s_and_b64 s[8:9], s[14:15], -1 +; GISEL-NEXT: s_cmov_b64 exec, s[14:15] +; GISEL-NEXT: s_cbranch_scc0 .LBB7_9 ; GISEL-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] ; GISEL-NEXT: v_and_b32_e32 v1, 1, v1 @@ -2183,10 +2317,10 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: v_or3_b32 v1, v2, v13, v1 ; GISEL-NEXT: v_add_u32_e32 v3, 0x80000000, v1 ; GISEL-NEXT: v_mov_b32_e32 v2, v1 -; GISEL-NEXT: .LBB7_9: ; %Flow3 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] -; GISEL-NEXT: .LBB7_10: ; %fp-to-i-cleanup +; GISEL-NEXT: .LBB7_9: ; %Flow3 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: .LBB7_10: ; %fp-to-i-cleanup ; GISEL-NEXT: s_setpc_b64 s[30:31] %cvt = fptoui bfloat %x to i128 ret i128 %cvt diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll index eeddc2211ea97..c757f9a0f9d5f 100644 --- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll +++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,MUBUF %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-MUBUF,MUBUF %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca,+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-FLATSCR %s diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll index 3b2f15c8340a6..ab74285d906ec 100644 --- a/llvm/test/CodeGen/AMDGPU/function-args.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args.ll @@ -104,35 +104,42 @@ define void @i1_arg_i1_use(i1 %arg) #0 { ; CIGFX89-NEXT: v_and_b32_e32 v0, 1, v0 ; CIGFX89-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; CIGFX89-NEXT: s_xor_b64 s[6:7], vcc, -1 -; CIGFX89-NEXT: s_and_saveexec_b64 s[4:5], s[6:7] -; CIGFX89-NEXT: s_cbranch_execz .LBB3_2 +; CIGFX89-NEXT: s_and_b64 s[6:7], s[6:7], exec +; CIGFX89-NEXT: s_mov_b64 s[4:5], exec +; CIGFX89-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; CIGFX89-NEXT: s_cmov_b64 exec, s[6:7] +; CIGFX89-NEXT: s_cbranch_scc0 .LBB3_2 ; CIGFX89-NEXT: ; %bb.1: ; %bb1 ; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 ; CIGFX89-NEXT: s_mov_b32 s6, -1 ; CIGFX89-NEXT: v_mov_b32_e32 v0, 0 ; CIGFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; CIGFX89-NEXT: s_waitcnt vmcnt(0) -; CIGFX89-NEXT: .LBB3_2: ; %bb2 ; CIGFX89-NEXT: s_or_b64 exec, exec, s[4:5] +; CIGFX89-NEXT: .LBB3_2: ; %bb2 ; CIGFX89-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: i1_arg_i1_use: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX11-NEXT: s_xor_b32 s1, vcc_lo, -1 -; GFX11-NEXT: s_and_saveexec_b32 s0, s1 -; GFX11-NEXT: s_cbranch_execz .LBB3_2 +; GFX11-NEXT: s_and_b32 s1, s1, exec_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s3, s1, -1 +; GFX11-NEXT: s_cmov_b32 exec_lo, s1 +; GFX11-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX11-NEXT: ; %bb.1: ; %bb1 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: .LBB3_2: ; %bb2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: .LBB3_2: ; %bb2 ; GFX11-NEXT: s_setpc_b64 s[30:31] bb: br i1 %arg, label %bb2, label %bb1 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll index ee0910b21f024..dcd41504c98fa 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll @@ -198,14 +198,14 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX908-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY %1 ; GFX908-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_1]], [[COPY8]], [[COPY3]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX908-NEXT: SI_WAVE_RECONVERGE [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3.Flow: ; GFX908-NEXT: successors: %bb.4(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX908-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.4 (%ir-block.37): - ; GFX908-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_atomicrmw @@ -260,14 +260,14 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY %1 ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_1]], [[COPY8]], [[COPY3]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX90A_GFX940-NEXT: SI_WAVE_RECONVERGE [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX90A_GFX940-NEXT: {{ $}} ; GFX90A_GFX940-NEXT: bb.3.Flow: ; GFX90A_GFX940-NEXT: successors: %bb.4(0x80000000) ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX90A_GFX940-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX90A_GFX940-NEXT: {{ $}} ; GFX90A_GFX940-NEXT: bb.4 (%ir-block.37): - ; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX90A_GFX940-NEXT: S_ENDPGM 0 ; ; GFX11_GFX12-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_atomicrmw @@ -314,14 +314,14 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX11_GFX12-NEXT: {{ $}} ; GFX11_GFX12-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX11_GFX12-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_1]], %1, [[COPY3]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX11_GFX12-NEXT: SI_WAVE_RECONVERGE [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX11_GFX12-NEXT: {{ $}} ; GFX11_GFX12-NEXT: bb.3.Flow: ; GFX11_GFX12-NEXT: successors: %bb.4(0x80000000) ; GFX11_GFX12-NEXT: {{ $}} - ; GFX11_GFX12-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX11_GFX12-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX11_GFX12-NEXT: {{ $}} ; GFX11_GFX12-NEXT: bb.4 (%ir-block.30): - ; GFX11_GFX12-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX11_GFX12-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic ret void diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll index 3454e9d1019e5..c5f586802874f 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll @@ -210,23 +210,23 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY %2 ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_1]], [[COPY5]], [[COPY3]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX11-NEXT: SI_WAVE_RECONVERGE [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX11-NEXT: S_BRANCH %bb.4 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: bb.3.Flow: ; GFX11-NEXT: successors: %bb.5(0x80000000) ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[DEF]], %bb.0, %7, %bb.4 - ; GFX11-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX11-NEXT: S_BRANCH %bb.5 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: bb.4 (%ir-block.39): ; GFX11-NEXT: successors: %bb.3(0x80000000) ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF1]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2 - ; GFX11-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec ; GFX11-NEXT: early-clobber %44:vgpr_32 = STRICT_WWM [[V_WRITELANE_B32_]], implicit $exec ; GFX11-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_READFIRSTLANE_B32_]], 0, killed %44, 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX11-NEXT: S_BRANCH %bb.3 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: bb.5 (%ir-block.47): diff --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll index 0612383c3f90b..677471b526a69 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; RUN: llc -mtriple=amdgcn -mcpu=gfx803 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -filetype=obj < %s | llvm-objdump --triple=amdgcn--amdhsa --mcpu=gfx803 -d - | FileCheck -check-prefix=DISASSEMBLY-VI %s diff --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll index d10e049444d68..1ee360ddcca08 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll @@ -12,9 +12,11 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32(ptr addrspace(1) %ptr) #0 ; GFX900-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX900-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX900-NEXT: s_mov_b64 s[2:3], exec +; GFX900-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX900-NEXT: ; implicit-def: $vgpr1 -; GFX900-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX900-NEXT: s_cbranch_execz .LBB0_4 +; GFX900-NEXT: s_cmov_b64 exec, vcc +; GFX900-NEXT: s_cbranch_scc0 .LBB0_4 ; GFX900-NEXT: ; %bb.1: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-NEXT: s_bcnt1_i32_b64 s7, s[4:5] @@ -35,12 +37,13 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32(ptr addrspace(1) %ptr) #0 ; GFX900-NEXT: buffer_wbinvl1_vol ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX900-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX900-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX900-NEXT: s_cbranch_execnz .LBB0_2 +; GFX900-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX900-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX900-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX900-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX900-NEXT: ; %bb.3: ; %Flow -; GFX900-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX900-NEXT: .LBB0_4: ; %Flow1 ; GFX900-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX900-NEXT: .LBB0_4: ; GFX900-NEXT: v_readfirstlane_b32 s0, v1 ; GFX900-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX900-NEXT: v_mad_f32 v0, v0, 4.0, s0 @@ -53,9 +56,11 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32(ptr addrspace(1) %ptr) #0 ; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX908-NEXT: s_mov_b64 s[2:3], exec +; GFX908-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX908-NEXT: ; implicit-def: $vgpr1 -; GFX908-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX908-NEXT: s_cbranch_execz .LBB0_4 +; GFX908-NEXT: s_cmov_b64 exec, vcc +; GFX908-NEXT: s_cbranch_scc0 .LBB0_4 ; GFX908-NEXT: ; %bb.1: ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX908-NEXT: s_bcnt1_i32_b64 s7, s[4:5] @@ -76,12 +81,13 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32(ptr addrspace(1) %ptr) #0 ; GFX908-NEXT: buffer_wbinvl1_vol ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB0_2 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX908-NEXT: ; %bb.3: ; %Flow -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: .LBB0_4: ; %Flow1 ; GFX908-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX908-NEXT: .LBB0_4: ; GFX908-NEXT: v_readfirstlane_b32 s0, v1 ; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX908-NEXT: v_mad_f32 v0, v0, 4.0, s0 @@ -94,9 +100,11 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32(ptr addrspace(1) %ptr) #0 ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX90A-NEXT: ; implicit-def: $vgpr1 -; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB0_4 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB0_4 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s7, s[4:5] @@ -119,12 +127,13 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32(ptr addrspace(1) %ptr) #0 ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB0_2 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: .LBB0_4: ; %Flow1 ; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX90A-NEXT: .LBB0_4: ; GFX90A-NEXT: v_readfirstlane_b32 s0, v1 ; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX90A-NEXT: v_mad_f32 v0, v0, 4.0, s0 @@ -134,12 +143,14 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32(ptr addrspace(1) %ptr) #0 ; GFX10-LABEL: global_atomic_fadd_ret_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_mov_b32 s4, exec_lo -; GFX10-NEXT: s_mov_b32 s3, 0 +; GFX10-NEXT: s_mov_b32 s2, exec_lo ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10-NEXT: s_mov_b32 s3, 0 ; GFX10-NEXT: ; implicit-def: $vgpr1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB0_4 +; GFX10-NEXT: s_and_b32 s5, vcc_lo, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB0_4 ; GFX10-NEXT: ; %bb.1: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_bcnt1_i32_b32 s4, s4 @@ -160,12 +171,13 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32(ptr addrspace(1) %ptr) #0 ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX10-NEXT: s_or_b32 s3, vcc_lo, s3 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 -; GFX10-NEXT: s_cbranch_execnz .LBB0_2 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s3 +; GFX10-NEXT: s_and_b32 s5, s4, -1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s3 +; GFX10-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX10-NEXT: ; %bb.3: ; %Flow -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX10-NEXT: .LBB0_4: ; %Flow1 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: .LBB0_4: ; GFX10-NEXT: v_readfirstlane_b32 s0, v1 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX10-NEXT: v_mad_f32 v0, v0, 4.0, s0 @@ -175,12 +187,14 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32(ptr addrspace(1) %ptr) #0 ; GFX11-LABEL: global_atomic_fadd_ret_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_mov_b32 s4, exec_lo -; GFX11-NEXT: s_mov_b32 s3, 0 -; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX11-NEXT: s_mov_b32 s3, 0 ; GFX11-NEXT: ; implicit-def: $vgpr1 -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: s_cbranch_execz .LBB0_4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_and_b32 s5, vcc_lo, -1 +; GFX11-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB0_4 ; GFX11-NEXT: ; %bb.1: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_bcnt1_i32_b32 s4, s4 @@ -200,12 +214,13 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32(ptr addrspace(1) %ptr) #0 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX11-NEXT: s_or_b32 s3, vcc_lo, s3 -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3 -; GFX11-NEXT: s_cbranch_execnz .LBB0_2 +; GFX11-NEXT: s_and_not1_b32 s4, exec_lo, s3 +; GFX11-NEXT: s_and_b32 s5, s4, -1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s4, s3 +; GFX11-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX11-NEXT: ; %bb.3: ; %Flow -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX11-NEXT: .LBB0_4: ; %Flow1 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: .LBB0_4: ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX11-NEXT: v_readfirstlane_b32 s0, v1 ; GFX11-NEXT: v_mul_f32_e32 v0, 4.0, v0 @@ -226,9 +241,11 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_ieee(ptr addrspace(1) %ptr ; GFX900-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX900-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX900-NEXT: s_mov_b64 s[2:3], exec +; GFX900-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX900-NEXT: ; implicit-def: $vgpr1 -; GFX900-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX900-NEXT: s_cbranch_execz .LBB1_4 +; GFX900-NEXT: s_cmov_b64 exec, vcc +; GFX900-NEXT: s_cbranch_scc0 .LBB1_4 ; GFX900-NEXT: ; %bb.1: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-NEXT: s_bcnt1_i32_b64 s7, s[4:5] @@ -249,12 +266,13 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_ieee(ptr addrspace(1) %ptr ; GFX900-NEXT: buffer_wbinvl1_vol ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX900-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX900-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX900-NEXT: s_cbranch_execnz .LBB1_2 +; GFX900-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX900-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX900-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX900-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX900-NEXT: ; %bb.3: ; %Flow -; GFX900-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX900-NEXT: .LBB1_4: ; %Flow1 ; GFX900-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX900-NEXT: .LBB1_4: ; GFX900-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX900-NEXT: v_readfirstlane_b32 s0, v1 ; GFX900-NEXT: v_mul_f32_e32 v0, 4.0, v0 @@ -268,9 +286,11 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_ieee(ptr addrspace(1) %ptr ; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX908-NEXT: s_mov_b64 s[2:3], exec +; GFX908-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX908-NEXT: ; implicit-def: $vgpr1 -; GFX908-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX908-NEXT: s_cbranch_execz .LBB1_4 +; GFX908-NEXT: s_cmov_b64 exec, vcc +; GFX908-NEXT: s_cbranch_scc0 .LBB1_4 ; GFX908-NEXT: ; %bb.1: ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX908-NEXT: s_bcnt1_i32_b64 s7, s[4:5] @@ -291,12 +311,13 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_ieee(ptr addrspace(1) %ptr ; GFX908-NEXT: buffer_wbinvl1_vol ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB1_2 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX908-NEXT: ; %bb.3: ; %Flow -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: .LBB1_4: ; %Flow1 ; GFX908-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX908-NEXT: .LBB1_4: ; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX908-NEXT: v_readfirstlane_b32 s0, v1 ; GFX908-NEXT: v_mul_f32_e32 v0, 4.0, v0 @@ -310,9 +331,11 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_ieee(ptr addrspace(1) %ptr ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX90A-NEXT: ; implicit-def: $vgpr1 -; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB1_2 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -323,8 +346,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_ieee(ptr addrspace(1) %ptr ; GFX90A-NEXT: global_atomic_add_f32 v1, v1, v2, s[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: .LBB1_2: ; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX90A-NEXT: .LBB1_2: ; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s0, v1 ; GFX90A-NEXT: v_mul_f32_e32 v0, 4.0, v0 @@ -335,12 +358,14 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_ieee(ptr addrspace(1) %ptr ; GFX10-LABEL: global_atomic_fadd_ret_f32_ieee: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_mov_b32 s4, exec_lo -; GFX10-NEXT: s_mov_b32 s3, 0 +; GFX10-NEXT: s_mov_b32 s2, exec_lo ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10-NEXT: s_mov_b32 s3, 0 ; GFX10-NEXT: ; implicit-def: $vgpr1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB1_4 +; GFX10-NEXT: s_and_b32 s5, vcc_lo, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB1_4 ; GFX10-NEXT: ; %bb.1: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_bcnt1_i32_b32 s4, s4 @@ -361,12 +386,13 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_ieee(ptr addrspace(1) %ptr ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX10-NEXT: s_or_b32 s3, vcc_lo, s3 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 -; GFX10-NEXT: s_cbranch_execnz .LBB1_2 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s3 +; GFX10-NEXT: s_and_b32 s5, s4, -1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s3 +; GFX10-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX10-NEXT: ; %bb.3: ; %Flow -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX10-NEXT: .LBB1_4: ; %Flow1 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: .LBB1_4: ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 4.0, v0 @@ -380,8 +406,10 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_ieee(ptr addrspace(1) %ptr ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX11-NEXT: ; implicit-def: $vgpr1 -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: s_cbranch_execz .LBB1_2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX11-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX11-NEXT: ; %bb.1: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_bcnt1_i32_b32 s3, s3 @@ -392,8 +420,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_ieee(ptr addrspace(1) %ptr ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: .LBB1_2: ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: .LBB1_2: ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX11-NEXT: v_readfirstlane_b32 s0, v1 ; GFX11-NEXT: v_mul_f32_e32 v0, 4.0, v0 @@ -414,8 +442,9 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32(ptr addrspace(1) %ptr) # ; GFX900-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX900-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX900-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX900-NEXT: s_cbranch_execz .LBB2_3 +; GFX900-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX900-NEXT: s_cmov_b64 exec, vcc +; GFX900-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX900-NEXT: ; %bb.1: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-NEXT: s_bcnt1_i32_b64 s5, s[2:3] @@ -435,9 +464,11 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32(ptr addrspace(1) %ptr) # ; GFX900-NEXT: buffer_wbinvl1_vol ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX900-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX900-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX900-NEXT: s_cbranch_execnz .LBB2_2 +; GFX900-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX900-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX900-NEXT: .LBB2_3: ; GFX900-NEXT: s_endpgm ; @@ -447,8 +478,9 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32(ptr addrspace(1) %ptr) # ; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX908-NEXT: s_cbranch_execz .LBB2_2 +; GFX908-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX908-NEXT: s_cmov_b64 exec, vcc +; GFX908-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX908-NEXT: ; %bb.1: ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX908-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -468,8 +500,9 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32(ptr addrspace(1) %ptr) # ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB2_2 +; GFX90A-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -489,8 +522,9 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32(ptr addrspace(1) %ptr) # ; GFX10-NEXT: s_mov_b32 s2, 0 ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB2_3 +; GFX10-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX10-NEXT: ; %bb.1: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_bcnt1_i32_b32 s3, s3 @@ -511,18 +545,21 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32(ptr addrspace(1) %ptr) # ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX10-NEXT: s_cbranch_execnz .LBB2_2 +; GFX10-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX10-NEXT: s_and_b32 s4, s3, -1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX10-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX10-NEXT: .LBB2_3: ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_atomic_fadd_noret_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: s_mov_b32 s3, exec_lo ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: s_cbranch_execz .LBB2_2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX11-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX11-NEXT: ; %bb.1: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_bcnt1_i32_b32 s2, s2 @@ -546,8 +583,9 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_ieee(ptr addrspace(1) %p ; GFX900-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX900-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX900-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX900-NEXT: s_cbranch_execz .LBB3_3 +; GFX900-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX900-NEXT: s_cmov_b64 exec, vcc +; GFX900-NEXT: s_cbranch_scc0 .LBB3_3 ; GFX900-NEXT: ; %bb.1: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-NEXT: s_bcnt1_i32_b64 s5, s[2:3] @@ -567,9 +605,11 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_ieee(ptr addrspace(1) %p ; GFX900-NEXT: buffer_wbinvl1_vol ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX900-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX900-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX900-NEXT: s_cbranch_execnz .LBB3_2 +; GFX900-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX900-NEXT: s_cbranch_scc1 .LBB3_2 ; GFX900-NEXT: .LBB3_3: ; GFX900-NEXT: s_endpgm ; @@ -579,8 +619,9 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_ieee(ptr addrspace(1) %p ; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX908-NEXT: s_cbranch_execz .LBB3_2 +; GFX908-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX908-NEXT: s_cmov_b64 exec, vcc +; GFX908-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX908-NEXT: ; %bb.1: ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX908-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -600,8 +641,9 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_ieee(ptr addrspace(1) %p ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB3_2 +; GFX90A-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -621,8 +663,9 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_ieee(ptr addrspace(1) %p ; GFX10-NEXT: s_mov_b32 s2, 0 ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB3_3 +; GFX10-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB3_3 ; GFX10-NEXT: ; %bb.1: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_bcnt1_i32_b32 s3, s3 @@ -643,18 +686,21 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_ieee(ptr addrspace(1) %p ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX10-NEXT: s_cbranch_execnz .LBB3_2 +; GFX10-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX10-NEXT: s_and_b32 s4, s3, -1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX10-NEXT: s_cbranch_scc1 .LBB3_2 ; GFX10-NEXT: .LBB3_3: ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_atomic_fadd_noret_f32_ieee: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: s_mov_b32 s3, exec_lo ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: s_cbranch_execz .LBB3_2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX11-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX11-NEXT: ; %bb.1: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_bcnt1_i32_b32 s2, s2 @@ -678,9 +724,11 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_agent(ptr addrspace(1) %pt ; GFX900-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX900-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX900-NEXT: s_mov_b64 s[2:3], exec +; GFX900-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX900-NEXT: ; implicit-def: $vgpr1 -; GFX900-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX900-NEXT: s_cbranch_execz .LBB4_4 +; GFX900-NEXT: s_cmov_b64 exec, vcc +; GFX900-NEXT: s_cbranch_scc0 .LBB4_4 ; GFX900-NEXT: ; %bb.1: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-NEXT: s_bcnt1_i32_b64 s7, s[4:5] @@ -701,12 +749,13 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_agent(ptr addrspace(1) %pt ; GFX900-NEXT: buffer_wbinvl1_vol ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX900-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX900-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX900-NEXT: s_cbranch_execnz .LBB4_2 +; GFX900-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX900-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX900-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX900-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX900-NEXT: ; %bb.3: ; %Flow -; GFX900-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX900-NEXT: .LBB4_4: ; %Flow1 ; GFX900-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX900-NEXT: .LBB4_4: ; GFX900-NEXT: v_readfirstlane_b32 s0, v1 ; GFX900-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX900-NEXT: v_mad_f32 v0, v0, 4.0, s0 @@ -719,9 +768,11 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_agent(ptr addrspace(1) %pt ; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX908-NEXT: s_mov_b64 s[2:3], exec +; GFX908-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX908-NEXT: ; implicit-def: $vgpr1 -; GFX908-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX908-NEXT: s_cbranch_execz .LBB4_4 +; GFX908-NEXT: s_cmov_b64 exec, vcc +; GFX908-NEXT: s_cbranch_scc0 .LBB4_4 ; GFX908-NEXT: ; %bb.1: ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX908-NEXT: s_bcnt1_i32_b64 s7, s[4:5] @@ -742,12 +793,13 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_agent(ptr addrspace(1) %pt ; GFX908-NEXT: buffer_wbinvl1_vol ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB4_2 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX908-NEXT: ; %bb.3: ; %Flow -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: .LBB4_4: ; %Flow1 ; GFX908-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX908-NEXT: .LBB4_4: ; GFX908-NEXT: v_readfirstlane_b32 s0, v1 ; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX908-NEXT: v_mad_f32 v0, v0, 4.0, s0 @@ -760,9 +812,11 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_agent(ptr addrspace(1) %pt ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX90A-NEXT: ; implicit-def: $vgpr1 -; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB4_2 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -773,8 +827,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_agent(ptr addrspace(1) %pt ; GFX90A-NEXT: global_atomic_add_f32 v1, v1, v2, s[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: .LBB4_2: ; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX90A-NEXT: .LBB4_2: ; GFX90A-NEXT: v_readfirstlane_b32 s0, v1 ; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX90A-NEXT: v_mad_f32 v0, v0, 4.0, s0 @@ -784,12 +838,14 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_agent(ptr addrspace(1) %pt ; GFX10-LABEL: global_atomic_fadd_ret_f32_agent: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_mov_b32 s4, exec_lo -; GFX10-NEXT: s_mov_b32 s3, 0 +; GFX10-NEXT: s_mov_b32 s2, exec_lo ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10-NEXT: s_mov_b32 s3, 0 ; GFX10-NEXT: ; implicit-def: $vgpr1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB4_4 +; GFX10-NEXT: s_and_b32 s5, vcc_lo, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB4_4 ; GFX10-NEXT: ; %bb.1: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_bcnt1_i32_b32 s4, s4 @@ -810,12 +866,13 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_agent(ptr addrspace(1) %pt ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX10-NEXT: s_or_b32 s3, vcc_lo, s3 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 -; GFX10-NEXT: s_cbranch_execnz .LBB4_2 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s3 +; GFX10-NEXT: s_and_b32 s5, s4, -1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s3 +; GFX10-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX10-NEXT: ; %bb.3: ; %Flow -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX10-NEXT: .LBB4_4: ; %Flow1 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: .LBB4_4: ; GFX10-NEXT: v_readfirstlane_b32 s0, v1 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX10-NEXT: v_mad_f32 v0, v0, 4.0, s0 @@ -828,8 +885,10 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_agent(ptr addrspace(1) %pt ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX11-NEXT: ; implicit-def: $vgpr1 -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: s_cbranch_execz .LBB4_2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX11-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX11-NEXT: ; %bb.1: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_bcnt1_i32_b32 s3, s3 @@ -840,8 +899,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_agent(ptr addrspace(1) %pt ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: .LBB4_2: ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: .LBB4_2: ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX11-NEXT: v_readfirstlane_b32 s0, v1 ; GFX11-NEXT: v_mul_f32_e32 v0, 4.0, v0 @@ -862,9 +921,11 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_system(ptr addrspace(1) %p ; GFX900-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX900-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX900-NEXT: s_mov_b64 s[2:3], exec +; GFX900-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX900-NEXT: ; implicit-def: $vgpr1 -; GFX900-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX900-NEXT: s_cbranch_execz .LBB5_4 +; GFX900-NEXT: s_cmov_b64 exec, vcc +; GFX900-NEXT: s_cbranch_scc0 .LBB5_4 ; GFX900-NEXT: ; %bb.1: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-NEXT: s_bcnt1_i32_b64 s7, s[4:5] @@ -885,12 +946,13 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_system(ptr addrspace(1) %p ; GFX900-NEXT: buffer_wbinvl1_vol ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX900-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX900-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX900-NEXT: s_cbranch_execnz .LBB5_2 +; GFX900-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX900-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX900-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX900-NEXT: s_cbranch_scc1 .LBB5_2 ; GFX900-NEXT: ; %bb.3: ; %Flow -; GFX900-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX900-NEXT: .LBB5_4: ; %Flow1 ; GFX900-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX900-NEXT: .LBB5_4: ; GFX900-NEXT: v_readfirstlane_b32 s0, v1 ; GFX900-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX900-NEXT: v_mad_f32 v0, v0, 4.0, s0 @@ -903,9 +965,11 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_system(ptr addrspace(1) %p ; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX908-NEXT: s_mov_b64 s[2:3], exec +; GFX908-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX908-NEXT: ; implicit-def: $vgpr1 -; GFX908-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX908-NEXT: s_cbranch_execz .LBB5_4 +; GFX908-NEXT: s_cmov_b64 exec, vcc +; GFX908-NEXT: s_cbranch_scc0 .LBB5_4 ; GFX908-NEXT: ; %bb.1: ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX908-NEXT: s_bcnt1_i32_b64 s7, s[4:5] @@ -926,12 +990,13 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_system(ptr addrspace(1) %p ; GFX908-NEXT: buffer_wbinvl1_vol ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB5_2 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB5_2 ; GFX908-NEXT: ; %bb.3: ; %Flow -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: .LBB5_4: ; %Flow1 ; GFX908-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX908-NEXT: .LBB5_4: ; GFX908-NEXT: v_readfirstlane_b32 s0, v1 ; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX908-NEXT: v_mad_f32 v0, v0, 4.0, s0 @@ -944,9 +1009,11 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_system(ptr addrspace(1) %p ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX90A-NEXT: ; implicit-def: $vgpr1 -; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB5_4 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB5_4 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s7, s[4:5] @@ -969,12 +1036,13 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_system(ptr addrspace(1) %p ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB5_2 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB5_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: .LBB5_4: ; %Flow1 ; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX90A-NEXT: .LBB5_4: ; GFX90A-NEXT: v_readfirstlane_b32 s0, v1 ; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX90A-NEXT: v_mad_f32 v0, v0, 4.0, s0 @@ -984,12 +1052,14 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_system(ptr addrspace(1) %p ; GFX10-LABEL: global_atomic_fadd_ret_f32_system: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_mov_b32 s4, exec_lo -; GFX10-NEXT: s_mov_b32 s3, 0 +; GFX10-NEXT: s_mov_b32 s2, exec_lo ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10-NEXT: s_mov_b32 s3, 0 ; GFX10-NEXT: ; implicit-def: $vgpr1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB5_4 +; GFX10-NEXT: s_and_b32 s5, vcc_lo, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB5_4 ; GFX10-NEXT: ; %bb.1: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_bcnt1_i32_b32 s4, s4 @@ -1010,12 +1080,13 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_system(ptr addrspace(1) %p ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX10-NEXT: s_or_b32 s3, vcc_lo, s3 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 -; GFX10-NEXT: s_cbranch_execnz .LBB5_2 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s3 +; GFX10-NEXT: s_and_b32 s5, s4, -1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s3 +; GFX10-NEXT: s_cbranch_scc1 .LBB5_2 ; GFX10-NEXT: ; %bb.3: ; %Flow -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX10-NEXT: .LBB5_4: ; %Flow1 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: .LBB5_4: ; GFX10-NEXT: v_readfirstlane_b32 s0, v1 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX10-NEXT: v_mad_f32 v0, v0, 4.0, s0 @@ -1025,12 +1096,14 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_system(ptr addrspace(1) %p ; GFX11-LABEL: global_atomic_fadd_ret_f32_system: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_mov_b32 s4, exec_lo -; GFX11-NEXT: s_mov_b32 s3, 0 -; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX11-NEXT: s_mov_b32 s3, 0 ; GFX11-NEXT: ; implicit-def: $vgpr1 -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: s_cbranch_execz .LBB5_4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_and_b32 s5, vcc_lo, -1 +; GFX11-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB5_4 ; GFX11-NEXT: ; %bb.1: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_bcnt1_i32_b32 s4, s4 @@ -1050,12 +1123,13 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_system(ptr addrspace(1) %p ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX11-NEXT: s_or_b32 s3, vcc_lo, s3 -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3 -; GFX11-NEXT: s_cbranch_execnz .LBB5_2 +; GFX11-NEXT: s_and_not1_b32 s4, exec_lo, s3 +; GFX11-NEXT: s_and_b32 s5, s4, -1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s4, s3 +; GFX11-NEXT: s_cbranch_scc1 .LBB5_2 ; GFX11-NEXT: ; %bb.3: ; %Flow -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX11-NEXT: .LBB5_4: ; %Flow1 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: .LBB5_4: ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX11-NEXT: v_readfirstlane_b32 s0, v1 ; GFX11-NEXT: v_mul_f32_e32 v0, 4.0, v0 @@ -1076,9 +1150,11 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_wrong_subtarget(ptr addrsp ; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GCN-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_mov_b64 s[2:3], exec +; GCN-NEXT: s_and_b64 s[6:7], vcc, -1 ; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GCN-NEXT: s_cbranch_execz .LBB6_4 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB6_4 ; GCN-NEXT: ; %bb.1: ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_bcnt1_i32_b64 s7, s[4:5] @@ -1099,12 +1175,13 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_wrong_subtarget(ptr addrsp ; GCN-NEXT: buffer_wbinvl1_vol ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB6_2 +; GCN-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GCN-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN-NEXT: s_cbranch_scc1 .LBB6_2 ; GCN-NEXT: ; %bb.3: ; %Flow -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: .LBB6_4: ; %Flow1 ; GCN-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN-NEXT: .LBB6_4: ; GCN-NEXT: v_readfirstlane_b32 s0, v1 ; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GCN-NEXT: v_mad_f32 v0, v0, 4.0, s0 @@ -1117,9 +1194,11 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_wrong_subtarget(ptr addrsp ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11-NEXT: s_mov_b64 s[2:3], exec +; GFX11-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX11-NEXT: ; implicit-def: $vgpr1 -; GFX11-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX11-NEXT: s_cbranch_execz .LBB6_4 +; GFX11-NEXT: s_cmov_b64 exec, vcc +; GFX11-NEXT: s_cbranch_scc0 .LBB6_4 ; GFX11-NEXT: ; %bb.1: ; GFX11-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_bcnt1_i32_b64 s7, s[4:5] @@ -1140,12 +1219,13 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_wrong_subtarget(ptr addrsp ; GFX11-NEXT: buffer_wbinvl1_vol ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX11-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX11-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX11-NEXT: s_cbranch_execnz .LBB6_2 +; GFX11-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX11-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX11-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX11-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX11-NEXT: ; %bb.3: ; %Flow -; GFX11-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX11-NEXT: .LBB6_4: ; %Flow1 ; GFX11-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11-NEXT: .LBB6_4: ; GFX11-NEXT: v_readfirstlane_b32 s0, v1 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX11-NEXT: v_mad_f32 v0, v0, 4.0, s0 @@ -1163,8 +1243,9 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_wrong_subtarget(ptr addr ; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GCN-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz .LBB7_2 +; GCN-NEXT: s_and_b64 s[4:5], vcc, -1 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB7_2 ; GCN-NEXT: ; %bb.1: ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -1184,8 +1265,9 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_wrong_subtarget(ptr addr ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX11-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX11-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX11-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX11-NEXT: s_cmov_b64 exec, vcc +; GFX11-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX11-NEXT: ; %bb.1: ; GFX11-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -1209,8 +1291,9 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_safe(ptr addrspace(1) %p ; GFX900-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX900-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX900-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX900-NEXT: s_cbranch_execz .LBB8_3 +; GFX900-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX900-NEXT: s_cmov_b64 exec, vcc +; GFX900-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX900-NEXT: ; %bb.1: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-NEXT: s_bcnt1_i32_b64 s5, s[2:3] @@ -1230,9 +1313,11 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_safe(ptr addrspace(1) %p ; GFX900-NEXT: buffer_wbinvl1_vol ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX900-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX900-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX900-NEXT: s_cbranch_execnz .LBB8_2 +; GFX900-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX900-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX900-NEXT: .LBB8_3: ; GFX900-NEXT: s_endpgm ; @@ -1242,8 +1327,9 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_safe(ptr addrspace(1) %p ; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX908-NEXT: s_cbranch_execz .LBB8_3 +; GFX908-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX908-NEXT: s_cmov_b64 exec, vcc +; GFX908-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX908-NEXT: ; %bb.1: ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX908-NEXT: s_bcnt1_i32_b64 s5, s[2:3] @@ -1263,9 +1349,11 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_safe(ptr addrspace(1) %p ; GFX908-NEXT: buffer_wbinvl1_vol ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX908-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX908-NEXT: s_cbranch_execnz .LBB8_2 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX908-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX908-NEXT: .LBB8_3: ; GFX908-NEXT: s_endpgm ; @@ -1275,8 +1363,9 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_safe(ptr addrspace(1) %p ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB8_3 +; GFX90A-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s5, s[2:3] @@ -1296,9 +1385,11 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_safe(ptr addrspace(1) %p ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX90A-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX90A-NEXT: s_cbranch_execnz .LBB8_2 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX90A-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX90A-NEXT: .LBB8_3: ; GFX90A-NEXT: s_endpgm ; @@ -1308,8 +1399,9 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_safe(ptr addrspace(1) %p ; GFX10-NEXT: s_mov_b32 s2, 0 ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB8_3 +; GFX10-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX10-NEXT: ; %bb.1: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_bcnt1_i32_b32 s3, s3 @@ -1330,8 +1422,10 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_safe(ptr addrspace(1) %p ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX10-NEXT: s_cbranch_execnz .LBB8_2 +; GFX10-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX10-NEXT: s_and_b32 s4, s3, -1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX10-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX10-NEXT: .LBB8_3: ; GFX10-NEXT: s_endpgm ; @@ -1340,9 +1434,10 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_safe(ptr addrspace(1) %p ; GFX11-NEXT: s_mov_b32 s3, exec_lo ; GFX11-NEXT: s_mov_b32 s2, 0 ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX11-NEXT: s_mov_b32 s4, exec_lo -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: s_cbranch_execz .LBB8_3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX11-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX11-NEXT: ; %bb.1: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_bcnt1_i32_b32 s3, s3 @@ -1362,8 +1457,10 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_safe(ptr addrspace(1) %p ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, v0 ; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX11-NEXT: s_cbranch_execnz .LBB8_2 +; GFX11-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX11-NEXT: s_and_b32 s4, s3, -1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX11-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX11-NEXT: .LBB8_3: ; GFX11-NEXT: s_endpgm %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst @@ -1377,8 +1474,9 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #0 { ; GFX900-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX900-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX900-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX900-NEXT: s_cbranch_execz .LBB9_3 +; GFX900-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX900-NEXT: s_cmov_b64 exec, vcc +; GFX900-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX900-NEXT: ; %bb.1: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-NEXT: s_bcnt1_i32_b64 s5, s[2:3] @@ -1398,9 +1496,11 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #0 { ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX900-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX900-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX900-NEXT: s_cbranch_execnz .LBB9_2 +; GFX900-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX900-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX900-NEXT: .LBB9_3: ; GFX900-NEXT: s_endpgm ; @@ -1410,8 +1510,9 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #0 { ; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX908-NEXT: s_cbranch_execz .LBB9_2 +; GFX908-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX908-NEXT: s_cmov_b64 exec, vcc +; GFX908-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX908-NEXT: ; %bb.1: ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX908-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -1430,8 +1531,9 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #0 { ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB9_2 +; GFX90A-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -1450,8 +1552,9 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #0 { ; GFX10-NEXT: s_mov_b32 s2, 0 ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB9_3 +; GFX10-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX10-NEXT: ; %bb.1: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_bcnt1_i32_b32 s3, s3 @@ -1471,18 +1574,21 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #0 { ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX10-NEXT: s_cbranch_execnz .LBB9_2 +; GFX10-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX10-NEXT: s_and_b32 s4, s3, -1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX10-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX10-NEXT: .LBB9_3: ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: infer_as_before_atomic: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: s_mov_b32 s3, exec_lo ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX11-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX11-NEXT: ; %bb.1: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_bcnt1_i32_b32 s2, s2 @@ -1535,10 +1641,11 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p ; GFX900-NEXT: buffer_wbinvl1_vol ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX900-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX900-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX900-NEXT: s_cbranch_execnz .LBB10_1 +; GFX900-NEXT: s_andn2_b64 s[8:9], exec, s[0:1] +; GFX900-NEXT: s_and_b64 s[10:11], s[8:9], -1 +; GFX900-NEXT: s_cselect_b64 exec, s[8:9], s[0:1] +; GFX900-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX900-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX900-NEXT: v_lshrrev_b32_e32 v0, s5, v1 ; GFX900-NEXT: global_store_short v[0:1], v0, off ; GFX900-NEXT: s_endpgm @@ -1576,10 +1683,11 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p ; GFX908-NEXT: buffer_wbinvl1_vol ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX908-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX908-NEXT: s_cbranch_execnz .LBB10_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[0:1] +; GFX908-NEXT: s_and_b64 s[10:11], s[8:9], -1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[0:1] +; GFX908-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, s5, v1 ; GFX908-NEXT: global_store_short v[0:1], v0, off ; GFX908-NEXT: s_endpgm @@ -1617,10 +1725,11 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[0:1] +; GFX90A-NEXT: s_and_b64 s[10:11], s[8:9], -1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[0:1] +; GFX90A-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s5, v1 ; GFX90A-NEXT: global_store_short v[0:1], v0, off ; GFX90A-NEXT: s_endpgm @@ -1636,10 +1745,10 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p ; GFX10-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX10-NEXT: s_lshl_b32 s2, s2, 3 ; GFX10-NEXT: s_lshl_b32 s4, 0xffff, s2 -; GFX10-NEXT: s_not_b32 s4, s4 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: s_mov_b32 s3, 0 +; GFX10-NEXT: s_not_b32 s3, s4 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_mov_b32_e32 v2, v1 @@ -1651,17 +1760,18 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p ; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v1, v2, s4, v1 +; GFX10-NEXT: v_and_or_b32 v1, v2, s3, v1 ; GFX10-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX10-NEXT: s_or_b32 s3, vcc_lo, s3 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 -; GFX10-NEXT: s_cbranch_execnz .LBB10_1 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_and_b32 s6, s5, -1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, s2, v1 ; GFX10-NEXT: global_store_short v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -1677,10 +1787,10 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p ; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX11-NEXT: s_lshl_b32 s2, s2, 3 ; GFX11-NEXT: s_lshl_b32 s4, 0xffff, s2 -; GFX11-NEXT: s_not_b32 s4, s4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: s_not_b32 s3, s4 +; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1695,17 +1805,18 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p ; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, s2, v1 -; GFX11-NEXT: v_and_or_b32 v1, v2, s4, v1 +; GFX11-NEXT: v_and_or_b32 v1, v2, s3, v1 ; GFX11-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-NEXT: s_or_b32 s3, vcc_lo, s3 -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3 -; GFX11-NEXT: s_cbranch_execnz .LBB10_1 +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX11-NEXT: s_and_b32 s6, s5, -1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX11-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s2, v1 ; GFX11-NEXT: global_store_b16 v[0:1], v0, off ; GFX11-NEXT: s_nop 0 @@ -1750,10 +1861,11 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) % ; GFX900-NEXT: buffer_wbinvl1_vol ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX900-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX900-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX900-NEXT: s_cbranch_execnz .LBB11_1 +; GFX900-NEXT: s_andn2_b64 s[8:9], exec, s[0:1] +; GFX900-NEXT: s_and_b64 s[10:11], s[8:9], -1 +; GFX900-NEXT: s_cselect_b64 exec, s[8:9], s[0:1] +; GFX900-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX900-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX900-NEXT: v_lshrrev_b32_e32 v0, s5, v1 ; GFX900-NEXT: global_store_short v[0:1], v0, off ; GFX900-NEXT: s_endpgm @@ -1791,10 +1903,11 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) % ; GFX908-NEXT: buffer_wbinvl1_vol ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX908-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX908-NEXT: s_cbranch_execnz .LBB11_1 +; GFX908-NEXT: s_andn2_b64 s[8:9], exec, s[0:1] +; GFX908-NEXT: s_and_b64 s[10:11], s[8:9], -1 +; GFX908-NEXT: s_cselect_b64 exec, s[8:9], s[0:1] +; GFX908-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, s5, v1 ; GFX908-NEXT: global_store_short v[0:1], v0, off ; GFX908-NEXT: s_endpgm @@ -1834,10 +1947,11 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) % ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 +; GFX90A-NEXT: s_andn2_b64 s[8:9], exec, s[0:1] +; GFX90A-NEXT: s_and_b64 s[10:11], s[8:9], -1 +; GFX90A-NEXT: s_cselect_b64 exec, s[8:9], s[0:1] +; GFX90A-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s5, v1 ; GFX90A-NEXT: global_store_short v[0:1], v0, off ; GFX90A-NEXT: s_endpgm @@ -1853,10 +1967,10 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) % ; GFX10-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX10-NEXT: s_lshl_b32 s2, s2, 3 ; GFX10-NEXT: s_lshl_b32 s4, 0xffff, s2 -; GFX10-NEXT: s_not_b32 s4, s4 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: s_mov_b32 s3, 0 +; GFX10-NEXT: s_not_b32 s3, s4 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_mov_b32_e32 v2, v1 @@ -1868,17 +1982,18 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) % ; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v1, v2, s4, v1 +; GFX10-NEXT: v_and_or_b32 v1, v2, s3, v1 ; GFX10-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX10-NEXT: s_or_b32 s3, vcc_lo, s3 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 -; GFX10-NEXT: s_cbranch_execnz .LBB11_1 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_and_b32 s6, s5, -1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, s2, v1 ; GFX10-NEXT: global_store_short v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -1894,10 +2009,10 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) % ; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX11-NEXT: s_lshl_b32 s2, s2, 3 ; GFX11-NEXT: s_lshl_b32 s4, 0xffff, s2 -; GFX11-NEXT: s_not_b32 s4, s4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: s_not_b32 s3, s4 +; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1912,17 +2027,18 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) % ; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, s2, v1 -; GFX11-NEXT: v_and_or_b32 v1, v2, s4, v1 +; GFX11-NEXT: v_and_or_b32 v1, v2, s3, v1 ; GFX11-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-NEXT: s_or_b32 s3, vcc_lo, s3 -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3 -; GFX11-NEXT: s_cbranch_execnz .LBB11_1 +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: s_and_not1_b32 s5, exec_lo, s4 +; GFX11-NEXT: s_and_b32 s6, s5, -1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX11-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s2, v1 ; GFX11-NEXT: global_store_b16 v[0:1], v0, off ; GFX11-NEXT: s_nop 0 @@ -1949,10 +2065,11 @@ define <2 x half> @global_atomic_fadd_ret_v2f16(ptr addrspace(1) %ptr, <2 x half ; GFX900-NEXT: buffer_wbinvl1_vol ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX900-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX900-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX900-NEXT: s_cbranch_execnz .LBB12_1 +; GFX900-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX900-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX900-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX900-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX900-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX900-NEXT: v_mov_b32_e32 v0, v3 ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1971,10 +2088,11 @@ define <2 x half> @global_atomic_fadd_ret_v2f16(ptr addrspace(1) %ptr, <2 x half ; GFX908-NEXT: buffer_wbinvl1_vol ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB12_1 +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -1993,10 +2111,11 @@ define <2 x half> @global_atomic_fadd_ret_v2f16(ptr addrspace(1) %ptr, <2 x half ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2017,10 +2136,11 @@ define <2 x half> @global_atomic_fadd_ret_v2f16(ptr addrspace(1) %ptr, <2 x half ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB12_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_and_b32 s6, s5, -1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -2041,10 +2161,11 @@ define <2 x half> @global_atomic_fadd_ret_v2f16(ptr addrspace(1) %ptr, <2 x half ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB12_1 +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_and_b32 s2, s1, -1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst @@ -2066,11 +2187,12 @@ define void @global_atomic_fadd_noret_v2f16(ptr addrspace(1) %ptr, <2 x half> %v ; GFX900-NEXT: buffer_wbinvl1_vol ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX900-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX900-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX900-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX900-NEXT: s_cbranch_execnz .LBB13_1 +; GFX900-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX900-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX900-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_atomic_fadd_noret_v2f16: @@ -2087,11 +2209,12 @@ define void @global_atomic_fadd_noret_v2f16(ptr addrspace(1) %ptr, <2 x half> %v ; GFX908-NEXT: buffer_wbinvl1_vol ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX908-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB13_1 +; GFX908-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX908-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_atomic_fadd_noret_v2f16: @@ -2108,11 +2231,12 @@ define void @global_atomic_fadd_noret_v2f16(ptr addrspace(1) %ptr, <2 x half> %v ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_atomic_fadd_noret_v2f16: @@ -2132,10 +2256,11 @@ define void @global_atomic_fadd_noret_v2f16(ptr addrspace(1) %ptr, <2 x half> %v ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB13_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_and_b32 s6, s5, -1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_atomic_fadd_noret_v2f16: @@ -2155,10 +2280,11 @@ define void @global_atomic_fadd_noret_v2f16(ptr addrspace(1) %ptr, <2 x half> %v ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB13_1 +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_and_b32 s2, s1, -1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst ret void @@ -2198,10 +2324,11 @@ define <2 x bfloat> @global_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX900-NEXT: buffer_wbinvl1_vol ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX900-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX900-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX900-NEXT: s_cbranch_execnz .LBB14_1 +; GFX900-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX900-NEXT: s_and_b64 s[10:11], s[4:5], -1 +; GFX900-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX900-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX900-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX900-NEXT: v_mov_b32_e32 v0, v3 ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2238,10 +2365,11 @@ define <2 x bfloat> @global_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX908-NEXT: buffer_wbinvl1_vol ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB14_1 +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_and_b64 s[10:11], s[4:5], -1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -2278,10 +2406,11 @@ define <2 x bfloat> @global_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_and_b64 s[10:11], s[4:5], -1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2318,10 +2447,11 @@ define <2 x bfloat> @global_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB14_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_and_b32 s6, s4, -1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -2360,11 +2490,12 @@ define <2 x bfloat> @global_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_and_b32 s2, s0, -1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst @@ -2404,11 +2535,12 @@ define void @global_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> ; GFX900-NEXT: buffer_wbinvl1_vol ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX900-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX900-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX900-NEXT: s_and_b64 s[10:11], s[4:5], -1 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX900-NEXT: s_cbranch_execnz .LBB15_1 +; GFX900-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX900-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX900-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_atomic_fadd_noret_v2bf16: @@ -2443,11 +2575,12 @@ define void @global_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> ; GFX908-NEXT: buffer_wbinvl1_vol ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX908-NEXT: s_and_b64 s[10:11], s[4:5], -1 ; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB15_1 +; GFX908-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX908-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: global_atomic_fadd_noret_v2bf16: @@ -2482,11 +2615,12 @@ define void @global_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX90A-NEXT: s_and_b64 s[10:11], s[4:5], -1 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX90A-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_atomic_fadd_noret_v2bf16: @@ -2522,10 +2656,11 @@ define void @global_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB15_1 +; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5 +; GFX10-NEXT: s_and_b32 s6, s4, -1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5 +; GFX10-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_atomic_fadd_noret_v2bf16: @@ -2563,11 +2698,12 @@ define void @global_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB15_1 +; GFX11-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX11-NEXT: s_and_b32 s2, s0, -1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX11-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst ret void diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll index 6b4a6381d954c..8fbaa0b945622 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll @@ -28,10 +28,11 @@ define amdgpu_ps float @global_max_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB0_1 +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_max_saddr_i32_rtn: @@ -52,10 +53,11 @@ define amdgpu_ps float @global_max_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB0_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: global_max_saddr_i32_rtn: @@ -79,11 +81,12 @@ define amdgpu_ps float @global_max_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB0_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -111,10 +114,11 @@ define amdgpu_ps float @global_max_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB1_1 +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_max_saddr_i32_rtn_neg128: @@ -135,10 +139,11 @@ define amdgpu_ps float @global_max_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB1_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: global_max_saddr_i32_rtn_neg128: @@ -162,11 +167,12 @@ define amdgpu_ps float @global_max_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB1_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -193,9 +199,11 @@ define amdgpu_ps void @global_max_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB2_1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; @@ -216,8 +224,10 @@ define amdgpu_ps void @global_max_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB2_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; @@ -240,9 +250,11 @@ define amdgpu_ps void @global_max_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB2_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -268,9 +280,11 @@ define amdgpu_ps void @global_max_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB3_1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; @@ -291,8 +305,10 @@ define amdgpu_ps void @global_max_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB3_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; @@ -315,9 +331,11 @@ define amdgpu_ps void @global_max_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB3_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -348,10 +366,11 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB4_1 +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: ; return to shader part epilog @@ -376,10 +395,11 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB4_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: ; return to shader part epilog @@ -407,11 +427,12 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB4_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, v4 ; GFX11-NEXT: ; return to shader part epilog @@ -443,10 +464,11 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB5_1 +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: ; return to shader part epilog @@ -471,10 +493,11 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB5_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: ; return to shader part epilog @@ -502,11 +525,12 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB5_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, v4 ; GFX11-NEXT: ; return to shader part epilog @@ -538,9 +562,11 @@ define amdgpu_ps void @global_max_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB6_1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; @@ -564,8 +590,10 @@ define amdgpu_ps void @global_max_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: v_mov_b32_e32 v5, v3 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB6_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; @@ -591,9 +619,11 @@ define amdgpu_ps void @global_max_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: v_mov_b32_e32 v5, v3 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB6_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -622,9 +652,11 @@ define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB7_1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; @@ -648,8 +680,10 @@ define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: v_mov_b32_e32 v5, v3 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB7_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; @@ -675,9 +709,11 @@ define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: v_mov_b32_e32 v5, v3 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB7_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -710,10 +746,11 @@ define amdgpu_ps float @global_min_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB8_1 +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_min_saddr_i32_rtn: @@ -734,10 +771,11 @@ define amdgpu_ps float @global_min_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB8_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: global_min_saddr_i32_rtn: @@ -761,11 +799,12 @@ define amdgpu_ps float @global_min_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -793,10 +832,11 @@ define amdgpu_ps float @global_min_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB9_1 +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_min_saddr_i32_rtn_neg128: @@ -817,10 +857,11 @@ define amdgpu_ps float @global_min_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB9_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: global_min_saddr_i32_rtn_neg128: @@ -844,11 +885,12 @@ define amdgpu_ps float @global_min_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -875,9 +917,11 @@ define amdgpu_ps void @global_min_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB10_1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; @@ -898,8 +942,10 @@ define amdgpu_ps void @global_min_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB10_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; @@ -922,9 +968,11 @@ define amdgpu_ps void @global_min_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB10_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -950,9 +998,11 @@ define amdgpu_ps void @global_min_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB11_1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; @@ -973,8 +1023,10 @@ define amdgpu_ps void @global_min_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB11_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; @@ -997,9 +1049,11 @@ define amdgpu_ps void @global_min_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB11_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -1030,10 +1084,11 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB12_1 +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: ; return to shader part epilog @@ -1058,10 +1113,11 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB12_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: ; return to shader part epilog @@ -1089,11 +1145,12 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB12_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, v4 ; GFX11-NEXT: ; return to shader part epilog @@ -1125,10 +1182,11 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB13_1 +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: ; return to shader part epilog @@ -1153,10 +1211,11 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB13_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: ; return to shader part epilog @@ -1184,11 +1243,12 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB13_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, v4 ; GFX11-NEXT: ; return to shader part epilog @@ -1220,9 +1280,11 @@ define amdgpu_ps void @global_min_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB14_1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; @@ -1246,8 +1308,10 @@ define amdgpu_ps void @global_min_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: v_mov_b32_e32 v5, v3 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB14_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; @@ -1273,9 +1337,11 @@ define amdgpu_ps void @global_min_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: v_mov_b32_e32 v5, v3 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -1304,9 +1370,11 @@ define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB15_1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; @@ -1330,8 +1398,10 @@ define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: v_mov_b32_e32 v5, v3 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB15_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; @@ -1357,9 +1427,11 @@ define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: v_mov_b32_e32 v5, v3 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB15_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -1392,10 +1464,11 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB16_1 +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_umax_saddr_i32_rtn: @@ -1416,10 +1489,11 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB16_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: global_umax_saddr_i32_rtn: @@ -1443,11 +1517,12 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB16_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1475,10 +1550,11 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn_neg128(ptr addrspace(1) inreg ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB17_1 +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_umax_saddr_i32_rtn_neg128: @@ -1499,10 +1575,11 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn_neg128(ptr addrspace(1) inreg ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB17_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: global_umax_saddr_i32_rtn_neg128: @@ -1526,11 +1603,12 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB17_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -1557,9 +1635,11 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn(ptr addrspace(1) inreg %sbase ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB18_1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; @@ -1580,8 +1660,10 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn(ptr addrspace(1) inreg %sbase ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB18_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; @@ -1604,9 +1686,11 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn(ptr addrspace(1) inreg %sbase ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -1632,9 +1716,11 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB19_1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; @@ -1655,8 +1741,10 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB19_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; @@ -1679,9 +1767,11 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -1712,10 +1802,11 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(ptr addrspace(1) inreg % ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB20_1 +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: ; return to shader part epilog @@ -1740,10 +1831,11 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(ptr addrspace(1) inreg % ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB20_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: ; return to shader part epilog @@ -1771,11 +1863,12 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(ptr addrspace(1) inreg % ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB20_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, v4 ; GFX11-NEXT: ; return to shader part epilog @@ -1807,10 +1900,11 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(ptr addrspace(1) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB21_1 +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: ; return to shader part epilog @@ -1835,10 +1929,11 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(ptr addrspace(1) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB21_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: ; return to shader part epilog @@ -1866,11 +1961,12 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(ptr addrspace(1) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB21_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, v4 ; GFX11-NEXT: ; return to shader part epilog @@ -1902,9 +1998,11 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn(ptr addrspace(1) inreg %sbase ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB22_1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; @@ -1928,8 +2026,10 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn(ptr addrspace(1) inreg %sbase ; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: v_mov_b32_e32 v5, v3 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB22_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; @@ -1955,9 +2055,11 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn(ptr addrspace(1) inreg %sbase ; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: v_mov_b32_e32 v5, v3 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB22_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB22_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -1986,9 +2088,11 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB23_1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; @@ -2012,8 +2116,10 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: v_mov_b32_e32 v5, v3 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB23_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; @@ -2039,9 +2145,11 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: v_mov_b32_e32 v5, v3 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB23_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -2074,10 +2182,11 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB24_1 +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_umin_saddr_i32_rtn: @@ -2098,10 +2207,11 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB24_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: global_umin_saddr_i32_rtn: @@ -2125,11 +2235,12 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB24_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2157,10 +2268,11 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn_neg128(ptr addrspace(1) inreg ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB25_1 +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_umin_saddr_i32_rtn_neg128: @@ -2181,10 +2293,11 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn_neg128(ptr addrspace(1) inreg ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB25_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: global_umin_saddr_i32_rtn_neg128: @@ -2208,11 +2321,12 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB25_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2239,9 +2353,11 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn(ptr addrspace(1) inreg %sbase ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB26_1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; @@ -2262,8 +2378,10 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn(ptr addrspace(1) inreg %sbase ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB26_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; @@ -2286,9 +2404,11 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn(ptr addrspace(1) inreg %sbase ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB26_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -2314,9 +2434,11 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB27_1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; @@ -2337,8 +2459,10 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB27_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; @@ -2361,9 +2485,11 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -2394,10 +2520,11 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(ptr addrspace(1) inreg % ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB28_1 +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: ; return to shader part epilog @@ -2422,10 +2549,11 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(ptr addrspace(1) inreg % ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB28_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: ; return to shader part epilog @@ -2453,11 +2581,12 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(ptr addrspace(1) inreg % ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB28_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB28_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, v4 ; GFX11-NEXT: ; return to shader part epilog @@ -2489,10 +2618,11 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(ptr addrspace(1) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB29_1 +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: ; return to shader part epilog @@ -2517,10 +2647,11 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(ptr addrspace(1) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB29_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: ; return to shader part epilog @@ -2548,11 +2679,12 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(ptr addrspace(1) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB29_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, v4 ; GFX11-NEXT: ; return to shader part epilog @@ -2584,9 +2716,11 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn(ptr addrspace(1) inreg %sbase ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB30_1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; @@ -2610,8 +2744,10 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn(ptr addrspace(1) inreg %sbase ; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: v_mov_b32_e32 v5, v3 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB30_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; @@ -2637,9 +2773,11 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn(ptr addrspace(1) inreg %sbase ; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: v_mov_b32_e32 v5, v3 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB30_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -2668,9 +2806,11 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB31_1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; @@ -2694,8 +2834,10 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: v_mov_b32_e32 v5, v3 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-NEXT: s_cbranch_execnz .LBB31_1 +; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX10-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX10-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX10-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; @@ -2721,9 +2863,11 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: v_mov_b32_e32 v5, v3 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execnz .LBB31_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX11-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX11-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX11-NEXT: s_cbranch_scc1 .LBB31_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll index 516c92f1640ea..5e30cb32b94c4 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll @@ -2182,11 +2182,12 @@ define void @global_atomic_nand_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; SI-NEXT: v_mov_b32_e32 v4, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB51_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB51_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -2205,11 +2206,12 @@ define void @global_atomic_nand_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; VI-NEXT: v_mov_b32_e32 v4, v3 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB51_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB51_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i32_noret: @@ -2227,11 +2229,12 @@ define void @global_atomic_nand_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-NEXT: v_mov_b32_e32 v4, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB51_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw nand ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -2260,11 +2263,12 @@ define void @global_atomic_nand_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; SI-NEXT: v_mov_b32_e32 v4, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB52_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB52_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -2285,11 +2289,12 @@ define void @global_atomic_nand_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; VI-NEXT: v_mov_b32_e32 v4, v3 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB52_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB52_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i32_noret_offset: @@ -2307,11 +2312,12 @@ define void @global_atomic_nand_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-NEXT: v_mov_b32_e32 v4, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB52_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw nand ptr addrspace(1) %gep, i32 %in seq_cst @@ -2342,10 +2348,11 @@ define i32 @global_atomic_nand_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB53_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB53_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -2366,10 +2373,11 @@ define i32 @global_atomic_nand_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB53_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB53_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2389,10 +2397,11 @@ define i32 @global_atomic_nand_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB53_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw nand ptr addrspace(1) %ptr, i32 %in seq_cst @@ -2423,10 +2432,11 @@ define i32 @global_atomic_nand_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB54_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB54_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -2449,10 +2459,11 @@ define i32 @global_atomic_nand_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB54_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB54_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i32_ret_offset: @@ -2471,10 +2482,11 @@ define i32 @global_atomic_nand_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB54_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -2510,11 +2522,12 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inr ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_and_b64 s[40:41], s[38:39], -1 ; SI-NEXT: v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB55_1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB55_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v4, 1 ; SI-NEXT: v_readlane_b32 s6, v4, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -2540,11 +2553,12 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB55_1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB55_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i32_noret_scalar: @@ -2563,11 +2577,12 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB55_1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw nand ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -2601,11 +2616,12 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_offset_scalar(ptr addrspace ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_and_b64 s[40:41], s[38:39], -1 ; SI-NEXT: v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB56_1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB56_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v4, 1 ; SI-NEXT: v_readlane_b32 s6, v4, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -2633,11 +2649,12 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_offset_scalar(ptr addrspace ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB56_1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB56_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i32_noret_offset_scalar: @@ -2656,11 +2673,12 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_offset_scalar(ptr addrspace ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB56_1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw nand ptr addrspace(1) %gep, i32 %in seq_cst @@ -2696,10 +2714,11 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_scalar(ptr addrspace(1) inreg ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB57_1 +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_and_b64 s[40:41], s[38:39], -1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB57_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v3, 1 ; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -2728,10 +2747,11 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB57_1 +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB57_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i32_ret_scalar: @@ -2751,10 +2771,11 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_scalar(ptr addrspace(1) inreg ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB57_1 +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw nand ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -2789,10 +2810,11 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB58_1 +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_and_b64 s[40:41], s[38:39], -1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB58_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v3, 1 ; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -2821,10 +2843,11 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB58_1 +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB58_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i32_ret_offset_scalar: @@ -2844,10 +2867,11 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB58_1 +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw nand ptr addrspace(1) %gep, i32 %in seq_cst @@ -2877,11 +2901,12 @@ define void @global_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory(ptr ad ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; SI-NEXT: v_mov_b32_e32 v4, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB59_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB59_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -2902,11 +2927,12 @@ define void @global_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory(ptr ad ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; VI-NEXT: v_mov_b32_e32 v4, v3 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB59_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB59_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory: @@ -2924,11 +2950,12 @@ define void @global_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory(ptr ad ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-NEXT: v_mov_b32_e32 v4, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB59_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw nand ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -2959,10 +2986,11 @@ define i32 @global_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory(ptr addrs ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB60_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB60_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -2985,10 +3013,11 @@ define i32 @global_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory(ptr addrs ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB60_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB60_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory: @@ -3007,10 +3036,11 @@ define i32 @global_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory(ptr addrs ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB60_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB60_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 @@ -3964,11 +3994,12 @@ define void @global_atomic_max_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; SI-NEXT: v_mov_b32_e32 v4, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB83_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB83_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -3986,11 +4017,12 @@ define void @global_atomic_max_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; VI-NEXT: v_mov_b32_e32 v4, v3 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB83_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB83_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i32_noret: @@ -4007,11 +4039,12 @@ define void @global_atomic_max_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-NEXT: v_mov_b32_e32 v4, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB83_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB83_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw max ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -4039,11 +4072,12 @@ define void @global_atomic_max_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; SI-NEXT: v_mov_b32_e32 v4, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB84_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB84_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -4063,11 +4097,12 @@ define void @global_atomic_max_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; VI-NEXT: v_mov_b32_e32 v4, v3 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB84_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB84_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i32_noret_offset: @@ -4084,11 +4119,12 @@ define void @global_atomic_max_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-NEXT: v_mov_b32_e32 v4, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB84_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB84_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst @@ -4118,10 +4154,11 @@ define i32 @global_atomic_max_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB85_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB85_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -4141,10 +4178,11 @@ define i32 @global_atomic_max_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB85_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB85_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -4163,10 +4201,11 @@ define i32 @global_atomic_max_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB85_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB85_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw max ptr addrspace(1) %ptr, i32 %in seq_cst @@ -4196,10 +4235,11 @@ define i32 @global_atomic_max_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB86_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB86_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -4221,10 +4261,11 @@ define i32 @global_atomic_max_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB86_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB86_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i32_ret_offset: @@ -4242,10 +4283,11 @@ define i32 @global_atomic_max_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB86_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB86_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4280,11 +4322,12 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_scalar(ptr addrspace(1) inre ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_and_b64 s[40:41], s[38:39], -1 ; SI-NEXT: v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB87_1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB87_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v4, 1 ; SI-NEXT: v_readlane_b32 s6, v4, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -4309,11 +4352,12 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB87_1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB87_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i32_noret_scalar: @@ -4331,11 +4375,12 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_scalar(ptr addrspace(1) inre ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB87_1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB87_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw max ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -4368,11 +4413,12 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_offset_scalar(ptr addrspace( ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_and_b64 s[40:41], s[38:39], -1 ; SI-NEXT: v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB88_1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB88_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v4, 1 ; SI-NEXT: v_readlane_b32 s6, v4, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -4399,11 +4445,12 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_offset_scalar(ptr addrspace( ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB88_1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB88_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i32_noret_offset_scalar: @@ -4421,11 +4468,12 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_offset_scalar(ptr addrspace( ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB88_1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB88_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst @@ -4460,10 +4508,11 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_scalar(ptr addrspace(1) inreg % ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB89_1 +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_and_b64 s[40:41], s[38:39], -1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB89_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v3, 1 ; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -4491,10 +4540,11 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB89_1 +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB89_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i32_ret_scalar: @@ -4513,10 +4563,11 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_scalar(ptr addrspace(1) inreg % ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB89_1 +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB89_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw max ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -4550,10 +4601,11 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB90_1 +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_and_b64 s[40:41], s[38:39], -1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB90_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v3, 1 ; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -4581,10 +4633,11 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB90_1 +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB90_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i32_ret_offset_scalar: @@ -4603,10 +4656,11 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB90_1 +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB90_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst @@ -4640,9 +4694,11 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_andn2_b64 s[8:9], exec, s[0:1] +; SI-NEXT: s_and_b64 s[10:11], s[8:9], -1 ; SI-NEXT: v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB91_1 +; SI-NEXT: s_cselect_b64 exec, s[8:9], s[0:1] +; SI-NEXT: s_cbranch_scc1 .LBB91_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_endpgm ; @@ -4671,9 +4727,11 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; VI-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB91_1 +; VI-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; VI-NEXT: s_cbranch_scc1 .LBB91_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; @@ -4699,9 +4757,11 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB91_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB91_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: @@ -4739,11 +4799,12 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[0:1] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; SI-NEXT: v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB92_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[0:1] +; SI-NEXT: s_cbranch_scc1 .LBB92_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 @@ -4778,10 +4839,11 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB92_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; VI-NEXT: s_cbranch_scc1 .LBB92_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -4811,10 +4873,11 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB92_1 +; GFX9-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[10:11], s[8:9], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB92_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_store_dword v1, v0, s[6:7] ; GFX9-NEXT: s_endpgm @@ -4853,9 +4916,11 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_andn2_b64 s[8:9], exec, s[0:1] +; SI-NEXT: s_and_b64 s[10:11], s[8:9], -1 ; SI-NEXT: v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB93_1 +; SI-NEXT: s_cselect_b64 exec, s[8:9], s[0:1] +; SI-NEXT: s_cbranch_scc1 .LBB93_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_endpgm ; @@ -4882,9 +4947,11 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; VI-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB93_1 +; VI-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; VI-NEXT: s_cbranch_scc1 .LBB93_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; @@ -4910,9 +4977,11 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB93_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB93_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: @@ -4949,11 +5018,12 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[0:1] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; SI-NEXT: v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB94_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[0:1] +; SI-NEXT: s_cbranch_scc1 .LBB94_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 @@ -4986,10 +5056,11 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB94_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; VI-NEXT: s_cbranch_scc1 .LBB94_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -5019,10 +5090,11 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB94_1 +; GFX9-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[10:11], s[8:9], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB94_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_store_dword v1, v0, s[6:7] ; GFX9-NEXT: s_endpgm @@ -5055,11 +5127,12 @@ define void @global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory(ptr add ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; SI-NEXT: v_mov_b32_e32 v4, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB95_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB95_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -5079,11 +5152,12 @@ define void @global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory(ptr add ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; VI-NEXT: v_mov_b32_e32 v4, v3 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB95_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB95_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory: @@ -5100,11 +5174,12 @@ define void @global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory(ptr add ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-NEXT: v_mov_b32_e32 v4, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB95_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB95_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -5134,10 +5209,11 @@ define i32 @global_atomic_max_i32_ret_offset__amdgpu_no_remote_memory(ptr addrsp ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB96_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB96_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -5159,10 +5235,11 @@ define i32 @global_atomic_max_i32_ret_offset__amdgpu_no_remote_memory(ptr addrsp ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB96_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB96_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i32_ret_offset__amdgpu_no_remote_memory: @@ -5180,10 +5257,11 @@ define i32 @global_atomic_max_i32_ret_offset__amdgpu_no_remote_memory(ptr addrsp ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB96_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB96_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 @@ -5217,11 +5295,12 @@ define void @global_atomic_umax_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; SI-NEXT: v_mov_b32_e32 v4, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB97_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB97_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -5239,11 +5318,12 @@ define void @global_atomic_umax_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; VI-NEXT: v_mov_b32_e32 v4, v3 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB97_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB97_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i32_noret: @@ -5260,11 +5340,12 @@ define void @global_atomic_umax_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-NEXT: v_mov_b32_e32 v4, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB97_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB97_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw umax ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -5292,11 +5373,12 @@ define void @global_atomic_umax_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; SI-NEXT: v_mov_b32_e32 v4, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB98_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB98_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -5316,11 +5398,12 @@ define void @global_atomic_umax_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; VI-NEXT: v_mov_b32_e32 v4, v3 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB98_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB98_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i32_noret_offset: @@ -5337,11 +5420,12 @@ define void @global_atomic_umax_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-NEXT: v_mov_b32_e32 v4, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB98_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB98_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst @@ -5371,10 +5455,11 @@ define i32 @global_atomic_umax_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB99_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB99_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -5394,10 +5479,11 @@ define i32 @global_atomic_umax_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB99_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB99_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -5416,10 +5502,11 @@ define i32 @global_atomic_umax_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB99_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB99_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw umax ptr addrspace(1) %ptr, i32 %in seq_cst @@ -5449,10 +5536,11 @@ define i32 @global_atomic_umax_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB100_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB100_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -5474,10 +5562,11 @@ define i32 @global_atomic_umax_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB100_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB100_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i32_ret_offset: @@ -5495,10 +5584,11 @@ define i32 @global_atomic_umax_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB100_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB100_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5533,11 +5623,12 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_scalar(ptr addrspace(1) inr ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_and_b64 s[40:41], s[38:39], -1 ; SI-NEXT: v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB101_1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB101_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v4, 1 ; SI-NEXT: v_readlane_b32 s6, v4, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -5562,11 +5653,12 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB101_1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB101_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i32_noret_scalar: @@ -5584,11 +5676,12 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_scalar(ptr addrspace(1) inr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB101_1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB101_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw umax ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -5621,11 +5714,12 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_offset_scalar(ptr addrspace ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_and_b64 s[40:41], s[38:39], -1 ; SI-NEXT: v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB102_1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB102_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v4, 1 ; SI-NEXT: v_readlane_b32 s6, v4, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -5652,11 +5746,12 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_offset_scalar(ptr addrspace ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB102_1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB102_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i32_noret_offset_scalar: @@ -5674,11 +5769,12 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_offset_scalar(ptr addrspace ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB102_1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB102_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst @@ -5713,10 +5809,11 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_scalar(ptr addrspace(1) inreg ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB103_1 +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_and_b64 s[40:41], s[38:39], -1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB103_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v3, 1 ; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -5744,10 +5841,11 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB103_1 +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB103_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i32_ret_scalar: @@ -5766,10 +5864,11 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_scalar(ptr addrspace(1) inreg ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB103_1 +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB103_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw umax ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -5803,10 +5902,11 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB104_1 +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_and_b64 s[40:41], s[38:39], -1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB104_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v3, 1 ; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -5834,10 +5934,11 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB104_1 +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB104_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i32_ret_offset_scalar: @@ -5856,10 +5957,11 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB104_1 +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB104_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst @@ -5893,9 +5995,11 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_andn2_b64 s[8:9], exec, s[0:1] +; SI-NEXT: s_and_b64 s[10:11], s[8:9], -1 ; SI-NEXT: v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB105_1 +; SI-NEXT: s_cselect_b64 exec, s[8:9], s[0:1] +; SI-NEXT: s_cbranch_scc1 .LBB105_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_endpgm ; @@ -5924,9 +6028,11 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; VI-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB105_1 +; VI-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; VI-NEXT: s_cbranch_scc1 .LBB105_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; @@ -5952,9 +6058,11 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB105_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB105_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: @@ -5992,11 +6100,12 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[0:1] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; SI-NEXT: v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB106_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[0:1] +; SI-NEXT: s_cbranch_scc1 .LBB106_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 @@ -6031,10 +6140,11 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB106_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; VI-NEXT: s_cbranch_scc1 .LBB106_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -6064,10 +6174,11 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB106_1 +; GFX9-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[10:11], s[8:9], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB106_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_store_dword v1, v0, s[6:7] ; GFX9-NEXT: s_endpgm @@ -6107,11 +6218,12 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[0:1] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; SI-NEXT: v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB107_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[0:1] +; SI-NEXT: s_cbranch_scc1 .LBB107_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 @@ -6144,10 +6256,11 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB107_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; VI-NEXT: s_cbranch_scc1 .LBB107_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -6177,10 +6290,11 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB107_1 +; GFX9-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[10:11], s[8:9], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB107_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_store_dword v1, v0, s[6:7] ; GFX9-NEXT: s_endpgm @@ -6213,11 +6327,12 @@ define void @global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory(ptr ad ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; SI-NEXT: v_mov_b32_e32 v4, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB108_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB108_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -6237,11 +6352,12 @@ define void @global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory(ptr ad ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; VI-NEXT: v_mov_b32_e32 v4, v3 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB108_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB108_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory: @@ -6258,11 +6374,12 @@ define void @global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory(ptr ad ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-NEXT: v_mov_b32_e32 v4, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB108_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB108_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -6292,10 +6409,11 @@ define i32 @global_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory(ptr addrs ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB109_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB109_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -6317,10 +6435,11 @@ define i32 @global_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory(ptr addrs ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB109_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB109_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory: @@ -6338,10 +6457,11 @@ define i32 @global_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory(ptr addrs ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB109_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB109_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 @@ -6375,11 +6495,12 @@ define void @global_atomic_umin_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; SI-NEXT: v_mov_b32_e32 v4, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB110_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB110_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -6397,11 +6518,12 @@ define void @global_atomic_umin_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; VI-NEXT: v_mov_b32_e32 v4, v3 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB110_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB110_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i32_noret: @@ -6418,11 +6540,12 @@ define void @global_atomic_umin_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-NEXT: v_mov_b32_e32 v4, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB110_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB110_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw umin ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -6450,11 +6573,12 @@ define void @global_atomic_umin_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; SI-NEXT: v_mov_b32_e32 v4, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB111_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB111_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -6474,11 +6598,12 @@ define void @global_atomic_umin_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; VI-NEXT: v_mov_b32_e32 v4, v3 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB111_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB111_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i32_noret_offset: @@ -6495,11 +6620,12 @@ define void @global_atomic_umin_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-NEXT: v_mov_b32_e32 v4, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB111_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB111_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw umin ptr addrspace(1) %gep, i32 %in seq_cst @@ -6529,10 +6655,11 @@ define i32 @global_atomic_umin_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB112_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB112_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -6552,10 +6679,11 @@ define i32 @global_atomic_umin_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB112_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB112_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -6574,10 +6702,11 @@ define i32 @global_atomic_umin_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB112_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB112_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw umin ptr addrspace(1) %ptr, i32 %in seq_cst @@ -6607,10 +6736,11 @@ define i32 @global_atomic_umin_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB113_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB113_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -6632,10 +6762,11 @@ define i32 @global_atomic_umin_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB113_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB113_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i32_ret_offset: @@ -6653,10 +6784,11 @@ define i32 @global_atomic_umin_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB113_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB113_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6691,11 +6823,12 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_scalar(ptr addrspace(1) inr ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_and_b64 s[40:41], s[38:39], -1 ; SI-NEXT: v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB114_1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB114_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v4, 1 ; SI-NEXT: v_readlane_b32 s6, v4, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -6720,11 +6853,12 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB114_1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB114_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i32_noret_scalar: @@ -6742,11 +6876,12 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_scalar(ptr addrspace(1) inr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB114_1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB114_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw umin ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -6779,11 +6914,12 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_offset_scalar(ptr addrspace ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_and_b64 s[40:41], s[38:39], -1 ; SI-NEXT: v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB115_1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB115_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v4, 1 ; SI-NEXT: v_readlane_b32 s6, v4, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -6810,11 +6946,12 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_offset_scalar(ptr addrspace ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB115_1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB115_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i32_noret_offset_scalar: @@ -6832,11 +6969,12 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_offset_scalar(ptr addrspace ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB115_1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB115_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw umin ptr addrspace(1) %gep, i32 %in seq_cst @@ -6871,10 +7009,11 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_scalar(ptr addrspace(1) inreg ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB116_1 +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_and_b64 s[40:41], s[38:39], -1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB116_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v3, 1 ; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -6902,10 +7041,11 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB116_1 +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB116_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i32_ret_scalar: @@ -6924,10 +7064,11 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_scalar(ptr addrspace(1) inreg ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB116_1 +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB116_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw umin ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -6961,10 +7102,11 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB117_1 +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_and_b64 s[40:41], s[38:39], -1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB117_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v3, 1 ; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -6992,10 +7134,11 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB117_1 +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB117_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i32_ret_offset_scalar: @@ -7014,10 +7157,11 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB117_1 +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB117_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw umin ptr addrspace(1) %gep, i32 %in seq_cst @@ -7046,11 +7190,12 @@ define void @global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory(ptr ad ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; SI-NEXT: v_mov_b32_e32 v4, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB118_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB118_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -7070,11 +7215,12 @@ define void @global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory(ptr ad ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; VI-NEXT: v_mov_b32_e32 v4, v3 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB118_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB118_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory: @@ -7091,11 +7237,12 @@ define void @global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory(ptr ad ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-NEXT: v_mov_b32_e32 v4, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB118_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB118_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw umin ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -7125,10 +7272,11 @@ define i32 @global_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory(ptr addrs ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB119_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB119_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -7150,10 +7298,11 @@ define i32 @global_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory(ptr addrs ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB119_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB119_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory: @@ -7171,10 +7320,11 @@ define i32 @global_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory(ptr addrs ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB119_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB119_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 @@ -7208,11 +7358,12 @@ define void @global_atomic_min_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; SI-NEXT: v_mov_b32_e32 v4, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB120_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB120_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -7230,11 +7381,12 @@ define void @global_atomic_min_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; VI-NEXT: v_mov_b32_e32 v4, v3 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB120_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB120_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i32_noret: @@ -7251,11 +7403,12 @@ define void @global_atomic_min_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-NEXT: v_mov_b32_e32 v4, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB120_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB120_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw min ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -7283,11 +7436,12 @@ define void @global_atomic_min_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; SI-NEXT: v_mov_b32_e32 v4, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB121_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB121_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -7307,11 +7461,12 @@ define void @global_atomic_min_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; VI-NEXT: v_mov_b32_e32 v4, v3 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB121_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB121_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i32_noret_offset: @@ -7328,11 +7483,12 @@ define void @global_atomic_min_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-NEXT: v_mov_b32_e32 v4, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB121_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB121_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst @@ -7362,10 +7518,11 @@ define i32 @global_atomic_min_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB122_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB122_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -7385,10 +7542,11 @@ define i32 @global_atomic_min_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB122_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB122_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -7407,10 +7565,11 @@ define i32 @global_atomic_min_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB122_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB122_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw min ptr addrspace(1) %ptr, i32 %in seq_cst @@ -7440,10 +7599,11 @@ define i32 @global_atomic_min_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB123_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB123_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -7465,10 +7625,11 @@ define i32 @global_atomic_min_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB123_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB123_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i32_ret_offset: @@ -7486,10 +7647,11 @@ define i32 @global_atomic_min_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB123_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB123_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7524,11 +7686,12 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_scalar(ptr addrspace(1) inre ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_and_b64 s[40:41], s[38:39], -1 ; SI-NEXT: v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB124_1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB124_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v4, 1 ; SI-NEXT: v_readlane_b32 s6, v4, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -7553,11 +7716,12 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB124_1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB124_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i32_noret_scalar: @@ -7575,11 +7739,12 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_scalar(ptr addrspace(1) inre ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB124_1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB124_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw min ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -7612,11 +7777,12 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_offset_scalar(ptr addrspace( ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_and_b64 s[40:41], s[38:39], -1 ; SI-NEXT: v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB125_1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB125_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v4, 1 ; SI-NEXT: v_readlane_b32 s6, v4, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -7643,11 +7809,12 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_offset_scalar(ptr addrspace( ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB125_1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB125_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i32_noret_offset_scalar: @@ -7665,11 +7832,12 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_offset_scalar(ptr addrspace( ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB125_1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB125_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst @@ -7704,10 +7872,11 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_scalar(ptr addrspace(1) inreg % ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB126_1 +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_and_b64 s[40:41], s[38:39], -1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB126_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v3, 1 ; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -7735,10 +7904,11 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB126_1 +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB126_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i32_ret_scalar: @@ -7757,10 +7927,11 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_scalar(ptr addrspace(1) inreg % ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB126_1 +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB126_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw min ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -7794,10 +7965,11 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB127_1 +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_and_b64 s[40:41], s[38:39], -1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB127_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v3, 1 ; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -7825,10 +7997,11 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB127_1 +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB127_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i32_ret_offset_scalar: @@ -7847,10 +8020,11 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB127_1 +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB127_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst @@ -7884,9 +8058,11 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_andn2_b64 s[8:9], exec, s[0:1] +; SI-NEXT: s_and_b64 s[10:11], s[8:9], -1 ; SI-NEXT: v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB128_1 +; SI-NEXT: s_cselect_b64 exec, s[8:9], s[0:1] +; SI-NEXT: s_cbranch_scc1 .LBB128_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_endpgm ; @@ -7915,9 +8091,11 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; VI-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB128_1 +; VI-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; VI-NEXT: s_cbranch_scc1 .LBB128_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; @@ -7943,9 +8121,11 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB128_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB128_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: @@ -7983,11 +8163,12 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[0:1] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; SI-NEXT: v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB129_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[0:1] +; SI-NEXT: s_cbranch_scc1 .LBB129_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 @@ -8022,10 +8203,11 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB129_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; VI-NEXT: s_cbranch_scc1 .LBB129_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -8055,10 +8237,11 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB129_1 +; GFX9-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[10:11], s[8:9], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB129_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_store_dword v1, v0, s[6:7] ; GFX9-NEXT: s_endpgm @@ -8093,9 +8276,11 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_andn2_b64 s[8:9], exec, s[0:1] +; SI-NEXT: s_and_b64 s[10:11], s[8:9], -1 ; SI-NEXT: v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB130_1 +; SI-NEXT: s_cselect_b64 exec, s[8:9], s[0:1] +; SI-NEXT: s_cbranch_scc1 .LBB130_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_endpgm ; @@ -8118,9 +8303,11 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; VI-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB130_1 +; VI-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; VI-NEXT: s_cbranch_scc1 .LBB130_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; @@ -8142,9 +8329,11 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB130_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB130_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: @@ -8180,11 +8369,12 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[0:1] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; SI-NEXT: v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB131_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[0:1] +; SI-NEXT: s_cbranch_scc1 .LBB131_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 @@ -8217,10 +8407,11 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB131_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; VI-NEXT: s_cbranch_scc1 .LBB131_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -8250,10 +8441,11 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB131_1 +; GFX9-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[10:11], s[8:9], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB131_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_store_dword v1, v0, s[6:7] ; GFX9-NEXT: s_endpgm @@ -8286,11 +8478,12 @@ define void @global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory(ptr add ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; SI-NEXT: v_mov_b32_e32 v4, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB132_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB132_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -8310,11 +8503,12 @@ define void @global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory(ptr add ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; VI-NEXT: v_mov_b32_e32 v4, v3 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB132_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB132_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory: @@ -8331,11 +8525,12 @@ define void @global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory(ptr add ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-NEXT: v_mov_b32_e32 v4, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB132_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB132_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -8365,10 +8560,11 @@ define i32 @global_atomic_min_i32_ret_offset__amdgpu_no_remote_memory(ptr addrsp ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB133_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB133_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -8390,10 +8586,11 @@ define i32 @global_atomic_min_i32_ret_offset__amdgpu_no_remote_memory(ptr addrsp ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB133_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB133_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i32_ret_offset__amdgpu_no_remote_memory: @@ -8411,10 +8608,11 @@ define i32 @global_atomic_min_i32_ret_offset__amdgpu_no_remote_memory(ptr addrsp ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB133_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB133_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll index cafd35afea6eb..6699cafaf4637 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll @@ -2224,12 +2224,13 @@ define void @global_atomic_nand_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB50_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB50_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -2251,11 +2252,12 @@ define void @global_atomic_nand_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; VI-NEXT: v_mov_b32_e32 v6, v4 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB50_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB50_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i64_noret: @@ -2276,11 +2278,12 @@ define void @global_atomic_nand_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB50_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB50_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw nand ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -2313,12 +2316,13 @@ define void @global_atomic_nand_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB51_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB51_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -2342,11 +2346,12 @@ define void @global_atomic_nand_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; VI-NEXT: v_mov_b32_e32 v6, v4 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB51_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB51_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i64_noret_offset: @@ -2367,11 +2372,12 @@ define void @global_atomic_nand_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB51_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB51_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw nand ptr addrspace(1) %gep, i64 %in seq_cst @@ -2411,10 +2417,11 @@ define i64 @global_atomic_nand_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB52_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB52_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -2437,10 +2444,11 @@ define i64 @global_atomic_nand_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB52_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB52_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v4 ; VI-NEXT: v_mov_b32_e32 v1, v5 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -2464,10 +2472,11 @@ define i64 @global_atomic_nand_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB52_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB52_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2508,10 +2517,11 @@ define i64 @global_atomic_nand_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB53_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB53_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -2536,10 +2546,11 @@ define i64 @global_atomic_nand_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB53_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB53_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i64_ret_offset: @@ -2561,10 +2572,11 @@ define i64 @global_atomic_nand_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB53_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB53_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2606,12 +2618,13 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_scalar(ptr addrspace(1) inr ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_and_b64 s[40:41], s[38:39], -1 ; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v3, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB54_1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB54_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v8, 1 ; SI-NEXT: v_readlane_b32 s6, v8, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -2642,11 +2655,12 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB54_1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB54_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i64_noret_scalar: @@ -2668,11 +2682,12 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_scalar(ptr addrspace(1) inr ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB54_1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB54_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw nand ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -2711,12 +2726,13 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_offset_scalar(ptr addrspace ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_and_b64 s[40:41], s[38:39], -1 ; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v3, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB55_1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB55_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v8, 1 ; SI-NEXT: v_readlane_b32 s6, v8, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -2747,11 +2763,12 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_offset_scalar(ptr addrspace ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB55_1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB55_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i64_noret_offset_scalar: @@ -2773,11 +2790,12 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_offset_scalar(ptr addrspace ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB55_1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB55_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw nand ptr addrspace(1) %gep, i64 %in seq_cst @@ -2819,10 +2837,11 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_scalar(ptr addrspace(1) inreg ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB56_1 +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_and_b64 s[40:41], s[38:39], -1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB56_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v6, 1 ; SI-NEXT: v_readlane_b32 s6, v6, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -2854,10 +2873,11 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB56_1 +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB56_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i64_ret_scalar: @@ -2880,10 +2900,11 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_scalar(ptr addrspace(1) inreg ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB56_1 +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB56_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw nand ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -2924,10 +2945,11 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB57_1 +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_and_b64 s[40:41], s[38:39], -1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB57_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v6, 1 ; SI-NEXT: v_readlane_b32 s6, v6, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -2959,10 +2981,11 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB57_1 +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB57_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i64_ret_offset_scalar: @@ -2985,10 +3008,11 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB57_1 +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB57_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw nand ptr addrspace(1) %gep, i64 %in seq_cst @@ -3022,12 +3046,13 @@ define void @global_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr ad ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB58_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB58_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -3051,11 +3076,12 @@ define void @global_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr ad ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; VI-NEXT: v_mov_b32_e32 v6, v4 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB58_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB58_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory: @@ -3076,11 +3102,12 @@ define void @global_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr ad ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB58_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB58_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw nand ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -3120,10 +3147,11 @@ define i64 @global_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr addrs ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB59_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB59_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -3148,10 +3176,11 @@ define i64 @global_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr addrs ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB59_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB59_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory: @@ -3173,10 +3202,11 @@ define i64 @global_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr addrs ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB59_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB59_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4091,12 +4121,13 @@ define void @global_atomic_max_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB80_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB80_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -4117,11 +4148,12 @@ define void @global_atomic_max_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; VI-NEXT: v_mov_b32_e32 v6, v4 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB80_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB80_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i64_noret: @@ -4141,11 +4173,12 @@ define void @global_atomic_max_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB80_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB80_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw max ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -4177,12 +4210,13 @@ define void @global_atomic_max_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB81_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB81_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -4205,11 +4239,12 @@ define void @global_atomic_max_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; VI-NEXT: v_mov_b32_e32 v6, v4 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB81_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB81_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i64_noret_offset: @@ -4229,11 +4264,12 @@ define void @global_atomic_max_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB81_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB81_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw max ptr addrspace(1) %gep, i64 %in seq_cst @@ -4272,10 +4308,11 @@ define i64 @global_atomic_max_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB82_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB82_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -4297,10 +4334,11 @@ define i64 @global_atomic_max_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB82_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB82_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v4 ; VI-NEXT: v_mov_b32_e32 v1, v5 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -4323,10 +4361,11 @@ define i64 @global_atomic_max_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB82_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB82_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4366,10 +4405,11 @@ define i64 @global_atomic_max_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB83_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB83_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -4393,10 +4433,11 @@ define i64 @global_atomic_max_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB83_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB83_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i64_ret_offset: @@ -4417,10 +4458,11 @@ define i64 @global_atomic_max_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB83_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB83_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4463,12 +4505,13 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_and_b64 s[40:41], s[38:39], -1 ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB84_1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB84_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 ; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -4500,11 +4543,12 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB84_1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB84_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i64_noret_scalar: @@ -4527,11 +4571,12 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB84_1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB84_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw max ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -4571,12 +4616,13 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace( ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_and_b64 s[40:41], s[38:39], -1 ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB85_1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB85_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 ; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -4608,11 +4654,12 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace( ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB85_1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB85_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i64_noret_offset_scalar: @@ -4635,11 +4682,12 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace( ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB85_1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB85_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw max ptr addrspace(1) %gep, i64 %in seq_cst @@ -4682,10 +4730,11 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg % ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB86_1 +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_and_b64 s[40:41], s[38:39], -1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB86_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 ; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -4718,10 +4767,11 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB86_1 +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB86_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i64_ret_scalar: @@ -4745,10 +4795,11 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg % ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB86_1 +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB86_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw max ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -4790,10 +4841,11 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB87_1 +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_and_b64 s[40:41], s[38:39], -1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB87_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 ; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -4826,10 +4878,11 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB87_1 +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB87_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i64_ret_offset_scalar: @@ -4853,10 +4906,11 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB87_1 +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB87_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw max ptr addrspace(1) %gep, i64 %in seq_cst @@ -4896,10 +4950,12 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_andn2_b64 s[8:9], exec, s[0:1] +; SI-NEXT: s_and_b64 s[10:11], s[8:9], -1 ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB88_1 +; SI-NEXT: s_cselect_b64 exec, s[8:9], s[0:1] +; SI-NEXT: s_cbranch_scc1 .LBB88_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_endpgm ; @@ -4933,9 +4989,11 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[0:1], exec, s[4:5] +; VI-NEXT: s_and_b64 s[6:7], s[0:1], -1 ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB88_1 +; VI-NEXT: s_cselect_b64 exec, s[0:1], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB88_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; @@ -4966,9 +5024,11 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[8:9], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB88_1 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB88_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: @@ -5010,12 +5070,13 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; SI-NEXT: s_and_b64 s[12:13], s[6:7], -1 ; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v3, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB89_1 +; SI-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; SI-NEXT: s_cbranch_scc1 .LBB89_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 @@ -5053,10 +5114,11 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; VI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; VI-NEXT: s_cbranch_execnz .LBB89_1 +; VI-NEXT: s_andn2_b64 s[0:1], exec, s[8:9] +; VI-NEXT: s_and_b64 s[6:7], s[0:1], -1 +; VI-NEXT: s_cselect_b64 exec, s[0:1], s[8:9] +; VI-NEXT: s_cbranch_scc1 .LBB89_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[8:9] ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -5089,10 +5151,11 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX9-NEXT: s_cbranch_execnz .LBB89_1 +; GFX9-NEXT: s_andn2_b64 s[8:9], exec, s[6:7] +; GFX9-NEXT: s_and_b64 s[10:11], s[8:9], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[8:9], s[6:7] +; GFX9-NEXT: s_cbranch_scc1 .LBB89_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm @@ -5137,10 +5200,12 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_andn2_b64 s[8:9], exec, s[0:1] +; SI-NEXT: s_and_b64 s[10:11], s[8:9], -1 ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB90_1 +; SI-NEXT: s_cselect_b64 exec, s[8:9], s[0:1] +; SI-NEXT: s_cbranch_scc1 .LBB90_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_endpgm ; @@ -5172,9 +5237,11 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; VI-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB90_1 +; VI-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; VI-NEXT: s_cbranch_scc1 .LBB90_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; @@ -5205,9 +5272,11 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[8:9], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB90_1 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB90_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: @@ -5248,12 +5317,13 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; SI-NEXT: s_and_b64 s[12:13], s[6:7], -1 ; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v3, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB91_1 +; SI-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; SI-NEXT: s_cbranch_scc1 .LBB91_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 @@ -5289,10 +5359,11 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB91_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; VI-NEXT: s_cbranch_scc1 .LBB91_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -5325,10 +5396,11 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX9-NEXT: s_cbranch_execnz .LBB91_1 +; GFX9-NEXT: s_andn2_b64 s[8:9], exec, s[6:7] +; GFX9-NEXT: s_and_b64 s[10:11], s[8:9], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[8:9], s[6:7] +; GFX9-NEXT: s_cbranch_scc1 .LBB91_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm @@ -5365,12 +5437,13 @@ define void @global_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr add ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB92_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB92_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -5393,11 +5466,12 @@ define void @global_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr add ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; VI-NEXT: v_mov_b32_e32 v6, v4 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB92_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB92_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i64_noret_offset__amdgpu_no_remote_memory: @@ -5417,11 +5491,12 @@ define void @global_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr add ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB92_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB92_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw max ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -5460,10 +5535,11 @@ define i64 @global_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr addrsp ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB93_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB93_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -5487,10 +5563,11 @@ define i64 @global_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr addrsp ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB93_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB93_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i64_ret_offset__amdgpu_no_remote_memory: @@ -5511,10 +5588,11 @@ define i64 @global_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr addrsp ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB93_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB93_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5553,12 +5631,13 @@ define void @global_atomic_umax_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB94_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB94_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -5579,11 +5658,12 @@ define void @global_atomic_umax_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; VI-NEXT: v_mov_b32_e32 v6, v4 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB94_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB94_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i64_noret: @@ -5603,11 +5683,12 @@ define void @global_atomic_umax_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB94_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB94_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw umax ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -5639,12 +5720,13 @@ define void @global_atomic_umax_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB95_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB95_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -5667,11 +5749,12 @@ define void @global_atomic_umax_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; VI-NEXT: v_mov_b32_e32 v6, v4 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB95_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB95_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i64_noret_offset: @@ -5691,11 +5774,12 @@ define void @global_atomic_umax_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB95_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB95_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i64 %in seq_cst @@ -5734,10 +5818,11 @@ define i64 @global_atomic_umax_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB96_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB96_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -5759,10 +5844,11 @@ define i64 @global_atomic_umax_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB96_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB96_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v4 ; VI-NEXT: v_mov_b32_e32 v1, v5 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -5785,10 +5871,11 @@ define i64 @global_atomic_umax_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB96_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB96_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5828,10 +5915,11 @@ define i64 @global_atomic_umax_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB97_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB97_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -5855,10 +5943,11 @@ define i64 @global_atomic_umax_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB97_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB97_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i64_ret_offset: @@ -5879,10 +5968,11 @@ define i64 @global_atomic_umax_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB97_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB97_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5925,12 +6015,13 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_and_b64 s[40:41], s[38:39], -1 ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB98_1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB98_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 ; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -5962,11 +6053,12 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB98_1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB98_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i64_noret_scalar: @@ -5989,11 +6081,12 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB98_1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB98_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw umax ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -6033,12 +6126,13 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_and_b64 s[40:41], s[38:39], -1 ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB99_1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB99_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 ; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -6070,11 +6164,12 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB99_1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB99_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i64_noret_offset_scalar: @@ -6097,11 +6192,12 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB99_1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB99_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i64 %in seq_cst @@ -6144,10 +6240,11 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB100_1 +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_and_b64 s[40:41], s[38:39], -1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB100_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 ; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -6180,10 +6277,11 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB100_1 +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB100_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i64_ret_scalar: @@ -6207,10 +6305,11 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB100_1 +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB100_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw umax ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -6252,10 +6351,11 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB101_1 +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_and_b64 s[40:41], s[38:39], -1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB101_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 ; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -6288,10 +6388,11 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB101_1 +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB101_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i64_ret_offset_scalar: @@ -6315,10 +6416,11 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB101_1 +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB101_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw umax ptr addrspace(1) %gep, i64 %in seq_cst @@ -6358,10 +6460,12 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_andn2_b64 s[8:9], exec, s[0:1] +; SI-NEXT: s_and_b64 s[10:11], s[8:9], -1 ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB102_1 +; SI-NEXT: s_cselect_b64 exec, s[8:9], s[0:1] +; SI-NEXT: s_cbranch_scc1 .LBB102_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_endpgm ; @@ -6395,9 +6499,11 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[0:1], exec, s[4:5] +; VI-NEXT: s_and_b64 s[6:7], s[0:1], -1 ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB102_1 +; VI-NEXT: s_cselect_b64 exec, s[0:1], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB102_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; @@ -6428,9 +6534,11 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[8:9], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB102_1 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB102_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: @@ -6472,12 +6580,13 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; SI-NEXT: s_and_b64 s[12:13], s[6:7], -1 ; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v3, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB103_1 +; SI-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; SI-NEXT: s_cbranch_scc1 .LBB103_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 @@ -6515,10 +6624,11 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; VI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; VI-NEXT: s_cbranch_execnz .LBB103_1 +; VI-NEXT: s_andn2_b64 s[0:1], exec, s[8:9] +; VI-NEXT: s_and_b64 s[6:7], s[0:1], -1 +; VI-NEXT: s_cselect_b64 exec, s[0:1], s[8:9] +; VI-NEXT: s_cbranch_scc1 .LBB103_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[8:9] ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -6551,10 +6661,11 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX9-NEXT: s_cbranch_execnz .LBB103_1 +; GFX9-NEXT: s_andn2_b64 s[8:9], exec, s[6:7] +; GFX9-NEXT: s_and_b64 s[10:11], s[8:9], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[8:9], s[6:7] +; GFX9-NEXT: s_cbranch_scc1 .LBB103_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm @@ -6598,12 +6709,13 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; SI-NEXT: s_and_b64 s[12:13], s[6:7], -1 ; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v3, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB104_1 +; SI-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; SI-NEXT: s_cbranch_scc1 .LBB104_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 @@ -6639,10 +6751,11 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB104_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; VI-NEXT: s_cbranch_scc1 .LBB104_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -6675,10 +6788,11 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX9-NEXT: s_cbranch_execnz .LBB104_1 +; GFX9-NEXT: s_andn2_b64 s[8:9], exec, s[6:7] +; GFX9-NEXT: s_and_b64 s[10:11], s[8:9], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[8:9], s[6:7] +; GFX9-NEXT: s_cbranch_scc1 .LBB104_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm @@ -6715,12 +6829,13 @@ define void @global_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr ad ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB105_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB105_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -6743,11 +6858,12 @@ define void @global_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr ad ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; VI-NEXT: v_mov_b32_e32 v6, v4 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB105_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB105_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory: @@ -6767,11 +6883,12 @@ define void @global_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr ad ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB105_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB105_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -6810,10 +6927,11 @@ define i64 @global_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr addrs ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB106_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB106_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -6837,10 +6955,11 @@ define i64 @global_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr addrs ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB106_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB106_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory: @@ -6861,10 +6980,11 @@ define i64 @global_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr addrs ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB106_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB106_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6903,12 +7023,13 @@ define void @global_atomic_umin_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB107_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB107_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -6929,11 +7050,12 @@ define void @global_atomic_umin_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; VI-NEXT: v_mov_b32_e32 v6, v4 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB107_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB107_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i64_noret: @@ -6953,11 +7075,12 @@ define void @global_atomic_umin_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB107_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB107_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw umin ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -6989,12 +7112,13 @@ define void @global_atomic_umin_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB108_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB108_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -7017,11 +7141,12 @@ define void @global_atomic_umin_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; VI-NEXT: v_mov_b32_e32 v6, v4 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB108_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB108_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i64_noret_offset: @@ -7041,11 +7166,12 @@ define void @global_atomic_umin_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB108_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB108_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw umin ptr addrspace(1) %gep, i64 %in seq_cst @@ -7084,10 +7210,11 @@ define i64 @global_atomic_umin_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB109_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB109_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -7109,10 +7236,11 @@ define i64 @global_atomic_umin_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB109_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB109_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v4 ; VI-NEXT: v_mov_b32_e32 v1, v5 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -7135,10 +7263,11 @@ define i64 @global_atomic_umin_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB109_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB109_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -7178,10 +7307,11 @@ define i64 @global_atomic_umin_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB110_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB110_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -7205,10 +7335,11 @@ define i64 @global_atomic_umin_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB110_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB110_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i64_ret_offset: @@ -7229,10 +7360,11 @@ define i64 @global_atomic_umin_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB110_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB110_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -7275,12 +7407,13 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_and_b64 s[40:41], s[38:39], -1 ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB111_1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB111_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 ; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -7312,11 +7445,12 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB111_1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB111_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i64_noret_scalar: @@ -7339,11 +7473,12 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB111_1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB111_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw umin ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -7383,12 +7518,13 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_and_b64 s[40:41], s[38:39], -1 ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB112_1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB112_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 ; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -7420,11 +7556,12 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB112_1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB112_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i64_noret_offset_scalar: @@ -7447,11 +7584,12 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB112_1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB112_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw umin ptr addrspace(1) %gep, i64 %in seq_cst @@ -7494,10 +7632,11 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB113_1 +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_and_b64 s[40:41], s[38:39], -1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB113_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 ; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -7530,10 +7669,11 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB113_1 +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB113_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i64_ret_scalar: @@ -7557,10 +7697,11 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB113_1 +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB113_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw umin ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -7602,10 +7743,11 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB114_1 +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_and_b64 s[40:41], s[38:39], -1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB114_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 ; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -7638,10 +7780,11 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB114_1 +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB114_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i64_ret_offset_scalar: @@ -7665,10 +7808,11 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB114_1 +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB114_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw umin ptr addrspace(1) %gep, i64 %in seq_cst @@ -7701,12 +7845,13 @@ define void @global_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr ad ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB115_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB115_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -7729,11 +7874,12 @@ define void @global_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr ad ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; VI-NEXT: v_mov_b32_e32 v6, v4 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB115_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB115_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory: @@ -7753,11 +7899,12 @@ define void @global_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr ad ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB115_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB115_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw umin ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -7796,10 +7943,11 @@ define i64 @global_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr addrs ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB116_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB116_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -7823,10 +7971,11 @@ define i64 @global_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr addrs ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB116_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB116_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory: @@ -7847,10 +7996,11 @@ define i64 @global_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr addrs ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB116_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB116_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -7889,12 +8039,13 @@ define void @global_atomic_min_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB117_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB117_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -7915,11 +8066,12 @@ define void @global_atomic_min_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; VI-NEXT: v_mov_b32_e32 v6, v4 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB117_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB117_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i64_noret: @@ -7939,11 +8091,12 @@ define void @global_atomic_min_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB117_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB117_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw min ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -7975,12 +8128,13 @@ define void @global_atomic_min_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB118_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB118_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -8003,11 +8157,12 @@ define void @global_atomic_min_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; VI-NEXT: v_mov_b32_e32 v6, v4 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB118_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB118_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i64_noret_offset: @@ -8027,11 +8182,12 @@ define void @global_atomic_min_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB118_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB118_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw min ptr addrspace(1) %gep, i64 %in seq_cst @@ -8070,10 +8226,11 @@ define i64 @global_atomic_min_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB119_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB119_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -8095,10 +8252,11 @@ define i64 @global_atomic_min_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB119_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB119_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v4 ; VI-NEXT: v_mov_b32_e32 v1, v5 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -8121,10 +8279,11 @@ define i64 @global_atomic_min_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB119_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB119_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -8164,10 +8323,11 @@ define i64 @global_atomic_min_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB120_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB120_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -8191,10 +8351,11 @@ define i64 @global_atomic_min_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB120_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB120_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i64_ret_offset: @@ -8215,10 +8376,11 @@ define i64 @global_atomic_min_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB120_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB120_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -8261,12 +8423,13 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_and_b64 s[40:41], s[38:39], -1 ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB121_1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB121_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 ; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -8298,11 +8461,12 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB121_1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB121_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i64_noret_scalar: @@ -8325,11 +8489,12 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB121_1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB121_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw min ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -8369,12 +8534,13 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace( ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_and_b64 s[40:41], s[38:39], -1 ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB122_1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB122_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 ; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -8406,11 +8572,12 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace( ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB122_1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB122_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i64_noret_offset_scalar: @@ -8433,11 +8600,12 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace( ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_and_b64 s[38:39], s[36:37], -1 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB122_1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB122_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw min ptr addrspace(1) %gep, i64 %in seq_cst @@ -8480,10 +8648,11 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg % ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB123_1 +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_and_b64 s[40:41], s[38:39], -1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB123_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 ; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -8516,10 +8685,11 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB123_1 +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB123_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i64_ret_scalar: @@ -8543,10 +8713,11 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg % ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB123_1 +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB123_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw min ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -8588,10 +8759,11 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB124_1 +; SI-NEXT: s_andn2_b64 s[38:39], exec, s[36:37] +; SI-NEXT: s_and_b64 s[40:41], s[38:39], -1 +; SI-NEXT: s_cselect_b64 exec, s[38:39], s[36:37] +; SI-NEXT: s_cbranch_scc1 .LBB124_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 ; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -8624,10 +8796,11 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB124_1 +; VI-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; VI-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; VI-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; VI-NEXT: s_cbranch_scc1 .LBB124_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i64_ret_offset_scalar: @@ -8651,10 +8824,11 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB124_1 +; GFX9-NEXT: s_andn2_b64 s[36:37], exec, s[34:35] +; GFX9-NEXT: s_and_b64 s[38:39], s[36:37], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[36:37], s[34:35] +; GFX9-NEXT: s_cbranch_scc1 .LBB124_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw min ptr addrspace(1) %gep, i64 %in seq_cst @@ -8694,10 +8868,12 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_andn2_b64 s[8:9], exec, s[0:1] +; SI-NEXT: s_and_b64 s[10:11], s[8:9], -1 ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB125_1 +; SI-NEXT: s_cselect_b64 exec, s[8:9], s[0:1] +; SI-NEXT: s_cbranch_scc1 .LBB125_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_endpgm ; @@ -8731,9 +8907,11 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[0:1], exec, s[4:5] +; VI-NEXT: s_and_b64 s[6:7], s[0:1], -1 ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB125_1 +; VI-NEXT: s_cselect_b64 exec, s[0:1], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB125_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; @@ -8764,9 +8942,11 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[8:9], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB125_1 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB125_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: @@ -8808,12 +8988,13 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; SI-NEXT: s_and_b64 s[12:13], s[6:7], -1 ; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v3, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB126_1 +; SI-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; SI-NEXT: s_cbranch_scc1 .LBB126_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 @@ -8851,10 +9032,11 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; VI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; VI-NEXT: s_cbranch_execnz .LBB126_1 +; VI-NEXT: s_andn2_b64 s[0:1], exec, s[8:9] +; VI-NEXT: s_and_b64 s[6:7], s[0:1], -1 +; VI-NEXT: s_cselect_b64 exec, s[0:1], s[8:9] +; VI-NEXT: s_cbranch_scc1 .LBB126_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[8:9] ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -8887,10 +9069,11 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX9-NEXT: s_cbranch_execnz .LBB126_1 +; GFX9-NEXT: s_andn2_b64 s[8:9], exec, s[6:7] +; GFX9-NEXT: s_and_b64 s[10:11], s[8:9], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[8:9], s[6:7] +; GFX9-NEXT: s_cbranch_scc1 .LBB126_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm @@ -8933,10 +9116,12 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[0:1], exec, s[8:9] +; SI-NEXT: s_and_b64 s[10:11], s[0:1], -1 ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB127_1 +; SI-NEXT: s_cselect_b64 exec, s[0:1], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB127_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_endpgm ; @@ -8964,9 +9149,11 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[0:1], exec, s[4:5] +; VI-NEXT: s_and_b64 s[6:7], s[0:1], -1 ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB127_1 +; VI-NEXT: s_cselect_b64 exec, s[0:1], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB127_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; @@ -8993,9 +9180,11 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB127_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB127_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: @@ -9035,12 +9224,13 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; SI-NEXT: s_and_b64 s[12:13], s[6:7], -1 ; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v3, v5 -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB128_1 +; SI-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; SI-NEXT: s_cbranch_scc1 .LBB128_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 @@ -9076,10 +9266,11 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB128_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; VI-NEXT: s_cbranch_scc1 .LBB128_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -9112,10 +9303,11 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX9-NEXT: s_cbranch_execnz .LBB128_1 +; GFX9-NEXT: s_andn2_b64 s[8:9], exec, s[6:7] +; GFX9-NEXT: s_and_b64 s[10:11], s[8:9], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[8:9], s[6:7] +; GFX9-NEXT: s_cbranch_scc1 .LBB128_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm @@ -9152,12 +9344,13 @@ define void @global_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr add ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB129_1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB129_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -9180,11 +9373,12 @@ define void @global_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr add ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; VI-NEXT: v_mov_b32_e32 v6, v4 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB129_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB129_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i64_noret_offset__amdgpu_no_remote_memory: @@ -9204,11 +9398,12 @@ define void @global_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr add ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB129_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB129_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw min ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -9247,10 +9442,11 @@ define i64 @global_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr addrsp ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB130_1 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[8:9] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[8:9] +; SI-NEXT: s_cbranch_scc1 .LBB130_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -9274,10 +9470,11 @@ define i64 @global_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr addrsp ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB130_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB130_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i64_ret_offset__amdgpu_no_remote_memory: @@ -9298,10 +9495,11 @@ define i64 @global_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr addrsp ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB130_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB130_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll index d7773f746c6a6..cd3f640e5a270 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -22,8 +22,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB0_3 +; GFX7LESS-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -46,9 +47,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7LESS-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB0_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX7LESS-NEXT: .LBB0_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -58,8 +61,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB0_3 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3] @@ -78,9 +82,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB0_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX9-NEXT: .LBB0_3: ; GFX9-NEXT: s_endpgm ; @@ -90,8 +96,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB0_3 +; GFX1064-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -111,8 +118,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX1064-NEXT: .LBB0_3: ; GFX1064-NEXT: s_endpgm ; @@ -122,8 +131,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB0_3 +; GFX1032-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 @@ -142,20 +152,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX1032-NEXT: .LBB0_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB0_2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -173,11 +187,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB0_2 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2 @@ -197,8 +212,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX9-DPP-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3] @@ -217,9 +233,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX9-DPP-NEXT: .LBB0_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -229,8 +247,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX1064-DPP-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -250,8 +269,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX1064-DPP-NEXT: .LBB0_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -261,8 +282,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX1032-DPP-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 @@ -281,20 +303,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX1032-DPP-NEXT: .LBB0_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -312,11 +338,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_2 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, s2 @@ -378,9 +405,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX7LESS-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_1 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; @@ -427,9 +456,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB1_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB1_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 @@ -444,9 +474,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB1_4 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB1_4 ; GFX9-NEXT: .LBB1_5: ; GFX9-NEXT: s_endpgm ; @@ -493,9 +525,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB1_5 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB1_5 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 @@ -511,8 +544,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB1_4 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB1_4 ; GFX1064-NEXT: .LBB1_5: ; GFX1064-NEXT: s_endpgm ; @@ -559,9 +594,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB1_5 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB1_5 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 @@ -576,8 +612,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB1_4 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB1_4 ; GFX1032-NEXT: .LBB1_5: ; GFX1032-NEXT: s_endpgm ; @@ -615,12 +653,13 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB1_4 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB1_4 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 @@ -663,11 +702,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB1_4 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB1_4 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 @@ -738,8 +778,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX9-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB1_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -754,9 +795,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[2:3] +; GFX9-DPP-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX9-DPP-NEXT: .LBB1_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -810,18 +853,21 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_add_f32_e64 v3, s2, s3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX1064-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB1_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -837,8 +883,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX1064-DPP-NEXT: .LBB1_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -891,14 +939,17 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX1032-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB1_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -913,8 +964,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX1032-DPP-NEXT: .LBB1_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -964,21 +1017,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_2 +; GFX1164-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -1031,15 +1087,18 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_2 +; GFX1132-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -1055,18 +1114,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 { ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s10, -1 -; GFX7LESS-NEXT: s_mov_b32 s11, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s8, s8, s3 -; GFX7LESS-NEXT: s_addc_u32 s9, s9, 0 +; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s12, s12, s3 +; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 ; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB2_3 +; GFX7LESS-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 @@ -1093,9 +1153,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7LESS-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB2_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX7LESS-NEXT: .LBB2_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -1111,8 +1173,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB2_3 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -1135,9 +1198,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB2_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX9-NEXT: .LBB2_3: ; GFX9-NEXT: s_endpgm ; @@ -1153,8 +1218,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB2_3 +; GFX1064-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 @@ -1176,8 +1242,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX1064-NEXT: .LBB2_3: ; GFX1064-NEXT: s_endpgm ; @@ -1193,8 +1261,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB2_3 +; GFX1032-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 ; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 @@ -1215,8 +1284,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX1032-NEXT: .LBB2_3: ; GFX1032-NEXT: s_endpgm ; @@ -1226,15 +1297,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off ; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-NEXT: s_cbranch_execz .LBB2_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -1257,9 +1329,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX1164-NEXT: .LBB2_3: ; GFX1164-NEXT: s_endpgm ; @@ -1270,13 +1344,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off ; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-NEXT: s_cbranch_execz .LBB2_3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -1297,9 +1372,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX1132-NEXT: .LBB2_3: ; GFX1132-NEXT: s_endpgm ; @@ -1315,8 +1392,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX9-DPP-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -1339,9 +1417,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX9-DPP-NEXT: .LBB2_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -1357,8 +1437,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1064-DPP-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 @@ -1380,8 +1461,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX1064-DPP-NEXT: .LBB2_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -1397,8 +1480,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1032-DPP-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 ; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 @@ -1419,8 +1503,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX1032-DPP-NEXT: .LBB2_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -1430,15 +1516,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off ; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -1461,9 +1548,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX1164-DPP-NEXT: .LBB2_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -1474,13 +1563,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off ; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-DPP-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -1501,9 +1591,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX1132-DPP-NEXT: .LBB2_3: ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("one-as") monotonic @@ -1556,9 +1648,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX7LESS-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_1 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; @@ -1605,9 +1699,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB3_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB3_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 @@ -1622,9 +1717,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB3_4 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB3_4 ; GFX9-NEXT: .LBB3_5: ; GFX9-NEXT: s_endpgm ; @@ -1671,9 +1768,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB3_5 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB3_5 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 @@ -1689,8 +1787,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB3_4 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB3_4 ; GFX1064-NEXT: .LBB3_5: ; GFX1064-NEXT: s_endpgm ; @@ -1737,9 +1837,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB3_5 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB3_5 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 @@ -1754,8 +1855,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB3_4 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB3_4 ; GFX1032-NEXT: .LBB3_5: ; GFX1032-NEXT: s_endpgm ; @@ -1793,12 +1896,13 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB3_5 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB3_5 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 @@ -1814,9 +1918,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB3_4 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB3_4 ; GFX1164-NEXT: .LBB3_5: ; GFX1164-NEXT: s_endpgm ; @@ -1855,11 +1961,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB3_5 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB3_5 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 @@ -1874,9 +1981,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB3_4 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB3_4 ; GFX1132-NEXT: .LBB3_5: ; GFX1132-NEXT: s_endpgm ; @@ -1942,8 +2051,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX9-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB3_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -1958,9 +2068,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[2:3] +; GFX9-DPP-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB3_2 ; GFX9-DPP-NEXT: .LBB3_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -2014,18 +2126,21 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_add_f32_e64 v3, s2, s3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX1064-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB3_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -2041,8 +2156,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB3_2 ; GFX1064-DPP-NEXT: .LBB3_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -2095,14 +2212,17 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1032-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB3_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -2117,8 +2237,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB3_2 ; GFX1032-DPP-NEXT: .LBB3_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -2168,21 +2290,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1164-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB3_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -2198,9 +2323,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB3_2 ; GFX1164-DPP-NEXT: .LBB3_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -2248,16 +2375,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1132-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB3_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -2272,9 +2402,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB3_2 ; GFX1132-DPP-NEXT: .LBB3_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call float @div.float.value() strictfp @@ -2285,18 +2417,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp(ptr addrspace(1) %ptr) #2{ ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s10, -1 -; GFX7LESS-NEXT: s_mov_b32 s11, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s8, s8, s3 -; GFX7LESS-NEXT: s_addc_u32 s9, s9, 0 +; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s12, s12, s3 +; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 ; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB4_3 +; GFX7LESS-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 @@ -2323,9 +2456,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7LESS-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB4_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX7LESS-NEXT: .LBB4_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -2341,8 +2476,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB4_3 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -2365,9 +2501,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB4_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX9-NEXT: .LBB4_3: ; GFX9-NEXT: s_endpgm ; @@ -2383,8 +2521,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB4_3 +; GFX1064-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 @@ -2406,8 +2545,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1064-NEXT: .LBB4_3: ; GFX1064-NEXT: s_endpgm ; @@ -2423,8 +2564,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB4_3 +; GFX1032-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 ; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 @@ -2445,8 +2587,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1032-NEXT: .LBB4_3: ; GFX1032-NEXT: s_endpgm ; @@ -2456,15 +2600,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off ; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-NEXT: s_cbranch_execz .LBB4_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -2487,9 +2632,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1164-NEXT: .LBB4_3: ; GFX1164-NEXT: s_endpgm ; @@ -2500,13 +2647,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off ; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-NEXT: s_cbranch_execz .LBB4_3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -2527,9 +2675,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1132-NEXT: .LBB4_3: ; GFX1132-NEXT: s_endpgm ; @@ -2545,8 +2695,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX9-DPP-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -2569,9 +2720,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX9-DPP-NEXT: .LBB4_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -2587,8 +2740,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1064-DPP-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 @@ -2610,8 +2764,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1064-DPP-NEXT: .LBB4_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -2627,8 +2783,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1032-DPP-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 ; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 @@ -2649,8 +2806,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1032-DPP-NEXT: .LBB4_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -2660,15 +2819,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off ; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -2691,9 +2851,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1164-DPP-NEXT: .LBB4_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -2704,13 +2866,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off ; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-DPP-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -2731,9 +2894,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1132-DPP-NEXT: .LBB4_3: ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic @@ -2786,9 +2951,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX7LESS-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_1 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; @@ -2835,9 +3002,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB5_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB5_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 @@ -2852,9 +3020,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB5_4 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX9-NEXT: .LBB5_5: ; GFX9-NEXT: s_endpgm ; @@ -2901,9 +3071,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB5_5 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB5_5 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 @@ -2919,8 +3090,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB5_4 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1064-NEXT: .LBB5_5: ; GFX1064-NEXT: s_endpgm ; @@ -2967,9 +3140,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB5_5 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB5_5 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 @@ -2984,8 +3158,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB5_4 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1032-NEXT: .LBB5_5: ; GFX1032-NEXT: s_endpgm ; @@ -3023,12 +3199,13 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB5_4 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB5_4 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 @@ -3071,11 +3248,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB5_4 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB5_4 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 @@ -3146,8 +3324,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX9-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB5_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -3162,9 +3341,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[2:3] +; GFX9-DPP-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB5_2 ; GFX9-DPP-NEXT: .LBB5_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -3218,18 +3399,21 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_add_f32_e64 v3, s2, s3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX1064-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB5_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -3245,8 +3429,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB5_2 ; GFX1064-DPP-NEXT: .LBB5_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -3299,14 +3485,17 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1032-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB5_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -3321,8 +3510,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB5_2 ; GFX1032-DPP-NEXT: .LBB5_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -3372,21 +3563,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_2 +; GFX1164-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -3439,15 +3633,18 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_2 +; GFX1132-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -3506,9 +3703,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX7LESS-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_1 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; @@ -3555,9 +3754,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB6_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB6_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 @@ -3572,9 +3772,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB6_4 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB6_4 ; GFX9-NEXT: .LBB6_5: ; GFX9-NEXT: s_endpgm ; @@ -3621,9 +3823,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB6_5 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB6_5 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 @@ -3639,8 +3842,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB6_4 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB6_4 ; GFX1064-NEXT: .LBB6_5: ; GFX1064-NEXT: s_endpgm ; @@ -3687,9 +3892,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB6_5 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB6_5 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 @@ -3704,8 +3910,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB6_4 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB6_4 ; GFX1032-NEXT: .LBB6_5: ; GFX1032-NEXT: s_endpgm ; @@ -3743,12 +3951,13 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB6_4 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB6_4 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 @@ -3791,11 +4000,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB6_4 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB6_4 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 @@ -3866,8 +4076,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX9-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -3882,9 +4093,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[2:3] +; GFX9-DPP-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX9-DPP-NEXT: .LBB6_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -3938,18 +4151,21 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_add_f32_e64 v3, s2, s3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX1064-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -3965,8 +4181,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX1064-DPP-NEXT: .LBB6_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -4019,14 +4237,17 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1032-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -4041,8 +4262,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX1032-DPP-NEXT: .LBB6_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -4092,21 +4315,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_2 +; GFX1164-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -4159,15 +4385,18 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_2 +; GFX1132-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -4183,18 +4412,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scope_strictfp(ptr addrspace(1) %ptr) #2 { ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s10, -1 -; GFX7LESS-NEXT: s_mov_b32 s11, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s8, s8, s3 -; GFX7LESS-NEXT: s_addc_u32 s9, s9, 0 +; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s12, s12, s3 +; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 ; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB7_3 +; GFX7LESS-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 @@ -4221,9 +4451,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7LESS-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX7LESS-NEXT: .LBB7_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -4239,8 +4471,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB7_3 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -4263,9 +4496,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB7_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX9-NEXT: .LBB7_3: ; GFX9-NEXT: s_endpgm ; @@ -4281,8 +4516,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB7_3 +; GFX1064-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 @@ -4304,8 +4540,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX1064-NEXT: .LBB7_3: ; GFX1064-NEXT: s_endpgm ; @@ -4321,8 +4559,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB7_3 +; GFX1032-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 ; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 @@ -4343,8 +4582,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX1032-NEXT: .LBB7_3: ; GFX1032-NEXT: s_endpgm ; @@ -4354,15 +4595,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off ; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-NEXT: s_cbranch_execz .LBB7_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -4385,9 +4627,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX1164-NEXT: .LBB7_3: ; GFX1164-NEXT: s_endpgm ; @@ -4398,13 +4642,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off ; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-NEXT: s_cbranch_execz .LBB7_3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -4425,9 +4670,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX1132-NEXT: .LBB7_3: ; GFX1132-NEXT: s_endpgm ; @@ -4443,8 +4690,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX9-DPP-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -4467,9 +4715,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX9-DPP-NEXT: .LBB7_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -4485,8 +4735,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1064-DPP-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 @@ -4508,8 +4759,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX1064-DPP-NEXT: .LBB7_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -4525,8 +4778,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1032-DPP-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 ; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 @@ -4547,8 +4801,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX1032-DPP-NEXT: .LBB7_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -4558,15 +4814,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off ; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -4589,9 +4846,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX1164-DPP-NEXT: .LBB7_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -4602,13 +4861,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off ; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-DPP-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -4629,9 +4889,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX1132-DPP-NEXT: .LBB7_3: ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 monotonic, align 4 @@ -4683,9 +4945,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX7LESS-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; @@ -4732,9 +4996,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB8_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB8_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 @@ -4749,9 +5014,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB8_4 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX9-NEXT: .LBB8_5: ; GFX9-NEXT: s_endpgm ; @@ -4798,9 +5065,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB8_5 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB8_5 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 @@ -4816,8 +5084,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB8_4 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1064-NEXT: .LBB8_5: ; GFX1064-NEXT: s_endpgm ; @@ -4864,9 +5134,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB8_5 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB8_5 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 @@ -4881,8 +5152,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB8_4 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1032-NEXT: .LBB8_5: ; GFX1032-NEXT: s_endpgm ; @@ -4920,12 +5193,13 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB8_5 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB8_5 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 @@ -4941,9 +5215,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB8_4 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1164-NEXT: .LBB8_5: ; GFX1164-NEXT: s_endpgm ; @@ -4982,11 +5258,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB8_5 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB8_5 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 @@ -5001,9 +5278,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB8_4 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1132-NEXT: .LBB8_5: ; GFX1132-NEXT: s_endpgm ; @@ -5069,8 +5348,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX9-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -5085,9 +5365,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[2:3] +; GFX9-DPP-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX9-DPP-NEXT: .LBB8_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -5141,18 +5423,21 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_add_f32_e64 v3, s2, s3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX1064-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -5168,8 +5453,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX1064-DPP-NEXT: .LBB8_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -5222,14 +5509,17 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1032-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -5244,8 +5534,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX1032-DPP-NEXT: .LBB8_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -5295,21 +5587,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1164-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -5325,9 +5620,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX1164-DPP-NEXT: .LBB8_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -5375,16 +5672,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1132-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -5399,9 +5699,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX1132-DPP-NEXT: .LBB8_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call float @div.float.value() strictfp @@ -5426,8 +5728,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB9_3 +; GFX7LESS-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -5469,13 +5772,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[40:43], 0 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_2 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], exec, s[38:39] +; GFX7LESS-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX7LESS-NEXT: .LBB9_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -5492,11 +5797,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX9-NEXT: s_add_u32 s40, s40, s3 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-NEXT: s_addc_u32 s41, s41, 0 -; GFX9-NEXT: s_mov_b32 s33, s2 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b32 s33, s2 +; GFX9-NEXT: s_and_b64 s[2:3], vcc, -1 ; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB9_3 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 @@ -5541,8 +5847,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] -; GFX9-NEXT: s_cbranch_execnz .LBB9_2 +; GFX9-NEXT: s_andn2_b64 s[0:1], exec, s[38:39] +; GFX9-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX9-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX9-NEXT: .LBB9_3: ; GFX9-NEXT: s_endpgm ; @@ -5562,8 +5870,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB9_3 +; GFX1064-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 @@ -5609,8 +5918,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39] -; GFX1064-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1064-NEXT: s_andn2_b64 s[0:1], exec, s[38:39] +; GFX1064-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX1064-NEXT: .LBB9_3: ; GFX1064-NEXT: s_endpgm ; @@ -5629,9 +5940,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1032-NEXT: s_addc_u32 s41, s41, 0 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1032-NEXT: s_mov_b32 s38, 0 +; GFX1032-NEXT: s_and_b32 s0, vcc_lo, -1 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB9_3 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s2 ; GFX1032-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 @@ -5676,8 +5988,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 -; GFX1032-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1032-NEXT: s_andn2_b32 s0, exec_lo, s38 +; GFX1032-NEXT: s_and_b32 s1, s0, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s0, s38 +; GFX1032-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX1032-NEXT: .LBB9_3: ; GFX1032-NEXT: s_endpgm ; @@ -5689,11 +6003,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1164-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1164-NEXT: s_mov_b32 s32, 32 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB9_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s0, s[2:3] ; GFX1164-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 @@ -5739,8 +6054,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39] -; GFX1164-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], exec, s[38:39] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX1164-NEXT: .LBB9_3: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm @@ -5753,9 +6071,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1132-NEXT: s_mov_b32 s38, 0 ; GFX1132-NEXT: s_mov_b32 s32, 32 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB9_3 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s0, s2 ; GFX1132-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 @@ -5796,8 +6116,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 -; GFX1132-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1132-NEXT: s_and_not1_b32 s0, exec_lo, s38 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_b32 s1, s0, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s0, s38 +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX1132-NEXT: .LBB9_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm @@ -5815,11 +6138,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: s_add_u32 s40, s40, s3 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0 -; GFX9-DPP-NEXT: s_mov_b32 s33, s2 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_mov_b32 s33, s2 +; GFX9-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 @@ -5864,8 +6188,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX9-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[38:39] +; GFX9-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX9-DPP-NEXT: .LBB9_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -5885,8 +6211,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1064-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[2:3] ; GFX1064-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 @@ -5932,8 +6259,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[38:39] +; GFX1064-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX1064-DPP-NEXT: .LBB9_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -5952,9 +6281,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b32 s38, 0 +; GFX1032-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s2 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 @@ -5999,8 +6329,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s0, exec_lo, s38 +; GFX1032-DPP-NEXT: s_and_b32 s1, s0, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s0, s38 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX1032-DPP-NEXT: .LBB9_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -6012,11 +6344,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, s[2:3] ; GFX1164-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 @@ -6062,8 +6395,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[38:39] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1164-DPP-NEXT: s_and_not1_b64 s[0:1], exec, s[38:39] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX1164-DPP-NEXT: .LBB9_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm @@ -6076,9 +6412,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s38, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, s2 ; GFX1132-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 @@ -6119,8 +6457,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1132-DPP-NEXT: s_and_not1_b32 s0, exec_lo, s38 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_b32 s1, s0, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s0, s38 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX1132-DPP-NEXT: .LBB9_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm @@ -6205,13 +6546,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_1 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], exec, s[42:43] +; GFX7LESS-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[0:1], s[42:43] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; @@ -6292,8 +6635,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-NEXT: s_cbranch_execnz .LBB10_1 +; GFX9-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX9-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX9-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; @@ -6375,8 +6720,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1064-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; @@ -6458,8 +6805,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1032-NEXT: s_andn2_b32 s0, exec_lo, s44 +; GFX1032-NEXT: s_and_b32 s1, s0, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; @@ -6530,8 +6879,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], exec, s[44:45] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm @@ -6596,8 +6948,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1132-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_b32 s1, s0, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm @@ -6679,8 +7034,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_1 +; GFX9-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX9-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; @@ -6762,8 +7119,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1064-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; @@ -6845,8 +7204,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1032-DPP-NEXT: s_andn2_b32 s0, exec_lo, s44 +; GFX1032-DPP-NEXT: s_and_b32 s1, s0, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; @@ -6917,8 +7278,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1164-DPP-NEXT: s_and_not1_b64 s[0:1], exec, s[44:45] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm @@ -6983,8 +7347,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1132-DPP-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_b32 s1, s0, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm @@ -7006,8 +7373,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB11_3 +; GFX7LESS-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] @@ -7036,10 +7404,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7LESS-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX7LESS-NEXT: .LBB11_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -7055,8 +7425,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB11_3 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -7080,9 +7451,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB11_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX9-NEXT: .LBB11_3: ; GFX9-NEXT: s_endpgm ; @@ -7098,8 +7471,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB11_3 +; GFX1064-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 @@ -7122,8 +7496,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1064-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX1064-NEXT: .LBB11_3: ; GFX1064-NEXT: s_endpgm ; @@ -7139,8 +7515,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB11_3 +; GFX1032-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 ; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 @@ -7162,8 +7539,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX1032-NEXT: .LBB11_3: ; GFX1032-NEXT: s_endpgm ; @@ -7173,15 +7552,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off ; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-NEXT: s_cbranch_execz .LBB11_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -7205,9 +7585,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1164-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX1164-NEXT: .LBB11_3: ; GFX1164-NEXT: s_endpgm ; @@ -7218,13 +7600,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off ; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-NEXT: s_cbranch_execz .LBB11_3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -7245,9 +7628,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX1132-NEXT: .LBB11_3: ; GFX1132-NEXT: s_endpgm ; @@ -7263,8 +7648,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX9-DPP-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -7288,9 +7674,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX9-DPP-NEXT: .LBB11_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -7306,8 +7694,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1064-DPP-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 @@ -7330,8 +7719,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX1064-DPP-NEXT: .LBB11_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -7347,8 +7738,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1032-DPP-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 ; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 @@ -7370,8 +7762,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX1032-DPP-NEXT: .LBB11_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -7381,15 +7775,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off ; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -7413,9 +7808,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX1164-DPP-NEXT: .LBB11_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -7426,13 +7823,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off ; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-DPP-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -7453,9 +7851,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX1132-DPP-NEXT: .LBB11_3: ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") monotonic @@ -7509,10 +7909,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] ; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX7LESS-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6 ; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB12_1 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; @@ -7556,9 +7958,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX9-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB12_1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; @@ -7603,8 +8007,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1064-NEXT: v_mov_b32_e32 v5, v3 ; GFX1064-NEXT: v_mov_b32_e32 v4, v2 ; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1064-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX1064-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX1064-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; @@ -7649,8 +8055,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1032-NEXT: v_mov_b32_e32 v5, v3 ; GFX1032-NEXT: v_mov_b32_e32 v4, v2 ; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1032-NEXT: s_andn2_b32 s1, exec_lo, s0 +; GFX1032-NEXT: s_and_b32 s2, s1, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; @@ -7685,9 +8093,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1164-NEXT: v_mov_b32_e32 v5, v3 ; GFX1164-NEXT: v_mov_b32_e32 v4, v2 ; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX1164-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX1164-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-NEXT: s_endpgm ; @@ -7720,9 +8130,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX1132-NEXT: s_and_b32 s2, s1, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-NEXT: s_endpgm ; @@ -7766,9 +8178,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-DPP-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_1 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; @@ -7813,8 +8227,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1064-DPP-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX1064-DPP-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; @@ -7859,8 +8275,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1032-DPP-NEXT: s_andn2_b32 s1, exec_lo, s0 +; GFX1032-DPP-NEXT: s_and_b32 s2, s1, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; @@ -7895,9 +8313,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX1164-DPP-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-DPP-NEXT: s_endpgm ; @@ -7930,9 +8350,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX1132-DPP-NEXT: s_and_b32 s2, s1, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() strictfp @@ -7953,8 +8375,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB13_3 +; GFX7LESS-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] @@ -7983,10 +8406,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7LESS-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB13_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX7LESS-NEXT: .LBB13_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -8002,8 +8427,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB13_3 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -8027,9 +8453,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB13_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX9-NEXT: .LBB13_3: ; GFX9-NEXT: s_endpgm ; @@ -8045,8 +8473,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB13_3 +; GFX1064-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 @@ -8069,8 +8498,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1064-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX1064-NEXT: .LBB13_3: ; GFX1064-NEXT: s_endpgm ; @@ -8086,8 +8517,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB13_3 +; GFX1032-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 ; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 @@ -8109,8 +8541,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX1032-NEXT: .LBB13_3: ; GFX1032-NEXT: s_endpgm ; @@ -8120,15 +8554,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off ; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-NEXT: s_cbranch_execz .LBB13_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -8152,9 +8587,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX1164-NEXT: .LBB13_3: ; GFX1164-NEXT: s_endpgm ; @@ -8165,13 +8602,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off ; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-NEXT: s_cbranch_execz .LBB13_3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -8192,9 +8630,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX1132-NEXT: .LBB13_3: ; GFX1132-NEXT: s_endpgm ; @@ -8210,8 +8650,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX9-DPP-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -8235,9 +8676,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX9-DPP-NEXT: .LBB13_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -8253,8 +8696,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1064-DPP-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 @@ -8277,8 +8721,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX1064-DPP-NEXT: .LBB13_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -8294,8 +8740,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1032-DPP-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 ; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 @@ -8317,8 +8764,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX1032-DPP-NEXT: .LBB13_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -8328,15 +8777,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off ; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -8360,9 +8810,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX1164-DPP-NEXT: .LBB13_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -8373,13 +8825,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off ; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-DPP-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -8400,9 +8853,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX1132-DPP-NEXT: .LBB13_3: ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic @@ -8456,10 +8911,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] ; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX7LESS-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6 ; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB14_1 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; @@ -8503,9 +8960,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX9-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB14_1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; @@ -8550,8 +9009,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: v_mov_b32_e32 v5, v3 ; GFX1064-NEXT: v_mov_b32_e32 v4, v2 ; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB14_1 +; GFX1064-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX1064-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; @@ -8596,8 +9057,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: v_mov_b32_e32 v5, v3 ; GFX1032-NEXT: v_mov_b32_e32 v4, v2 ; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB14_1 +; GFX1032-NEXT: s_andn2_b32 s1, exec_lo, s0 +; GFX1032-NEXT: s_and_b32 s2, s1, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; @@ -8632,9 +9095,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v5, v3 ; GFX1164-NEXT: v_mov_b32_e32 v4, v2 ; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execnz .LBB14_1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX1164-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-NEXT: s_endpgm ; @@ -8667,9 +9132,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execnz .LBB14_1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX1132-NEXT: s_and_b32 s2, s1, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-NEXT: s_endpgm ; @@ -8713,9 +9180,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-DPP-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB14_1 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; @@ -8760,8 +9229,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB14_1 +; GFX1064-DPP-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX1064-DPP-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; @@ -8806,8 +9277,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB14_1 +; GFX1032-DPP-NEXT: s_andn2_b32 s1, exec_lo, s0 +; GFX1032-DPP-NEXT: s_and_b32 s2, s1, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; @@ -8842,9 +9315,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB14_1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX1164-DPP-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-DPP-NEXT: s_endpgm ; @@ -8877,9 +9352,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB14_1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX1132-DPP-NEXT: s_and_b32 s2, s1, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() @@ -8934,10 +9411,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] ; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX7LESS-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6 ; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB15_1 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; @@ -8981,9 +9460,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX9-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB15_1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; @@ -9028,8 +9509,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: v_mov_b32_e32 v5, v3 ; GFX1064-NEXT: v_mov_b32_e32 v4, v2 ; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1064-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX1064-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; @@ -9074,8 +9557,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: v_mov_b32_e32 v5, v3 ; GFX1032-NEXT: v_mov_b32_e32 v4, v2 ; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1032-NEXT: s_andn2_b32 s1, exec_lo, s0 +; GFX1032-NEXT: s_and_b32 s2, s1, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; @@ -9110,9 +9595,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v5, v3 ; GFX1164-NEXT: v_mov_b32_e32 v4, v2 ; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX1164-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-NEXT: s_endpgm ; @@ -9145,9 +9632,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX1132-NEXT: s_and_b32 s2, s1, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-NEXT: s_endpgm ; @@ -9191,9 +9680,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-DPP-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB15_1 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; @@ -9238,8 +9729,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1064-DPP-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX1064-DPP-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; @@ -9284,8 +9777,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1032-DPP-NEXT: s_andn2_b32 s1, exec_lo, s0 +; GFX1032-DPP-NEXT: s_and_b32 s2, s1, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; @@ -9320,9 +9815,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX1164-DPP-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-DPP-NEXT: s_endpgm ; @@ -9355,9 +9852,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX1132-DPP-NEXT: s_and_b32 s2, s1, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.float.value() strictfp @@ -9382,8 +9881,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB16_3 +; GFX7LESS-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -9428,13 +9928,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[40:43], 0 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB16_2 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], exec, s[38:39] +; GFX7LESS-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB16_2 ; GFX7LESS-NEXT: .LBB16_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -9451,11 +9953,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX9-NEXT: s_add_u32 s40, s40, s3 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-NEXT: s_addc_u32 s41, s41, 0 -; GFX9-NEXT: s_mov_b32 s33, s2 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b32 s33, s2 +; GFX9-NEXT: s_and_b64 s[2:3], vcc, -1 ; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB16_3 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -9503,8 +10006,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] -; GFX9-NEXT: s_cbranch_execnz .LBB16_2 +; GFX9-NEXT: s_andn2_b64 s[0:1], exec, s[38:39] +; GFX9-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX9-NEXT: s_cbranch_scc1 .LBB16_2 ; GFX9-NEXT: .LBB16_3: ; GFX9-NEXT: s_endpgm ; @@ -9524,8 +10029,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB16_3 +; GFX1064-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[2:3] ; GFX1064-NEXT: s_mov_b32 s1, 0x43300000 @@ -9572,8 +10078,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39] -; GFX1064-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1064-NEXT: s_andn2_b64 s[0:1], exec, s[38:39] +; GFX1064-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_2 ; GFX1064-NEXT: .LBB16_3: ; GFX1064-NEXT: s_endpgm ; @@ -9592,9 +10100,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1032-NEXT: s_addc_u32 s41, s41, 0 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1032-NEXT: s_mov_b32 s38, 0 +; GFX1032-NEXT: s_and_b32 s0, vcc_lo, -1 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB16_3 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s2 ; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 @@ -9640,8 +10149,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 -; GFX1032-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1032-NEXT: s_andn2_b32 s0, exec_lo, s38 +; GFX1032-NEXT: s_and_b32 s1, s0, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s0, s38 +; GFX1032-NEXT: s_cbranch_scc1 .LBB16_2 ; GFX1032-NEXT: .LBB16_3: ; GFX1032-NEXT: s_endpgm ; @@ -9659,10 +10170,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off offset:16 ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-NEXT: s_mov_b32 s32, 32 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-NEXT: s_cbranch_execz .LBB16_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -9709,8 +10221,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39] -; GFX1164-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], exec, s[38:39] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_2 ; GFX1164-NEXT: .LBB16_3: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm @@ -9727,11 +10242,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:20 ; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:16 ; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off offset:16 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1132-NEXT: s_mov_b32 s38, 0 ; GFX1132-NEXT: s_mov_b32 s32, 32 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-NEXT: s_cbranch_execz .LBB16_3 +; GFX1132-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -9772,8 +10288,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 -; GFX1132-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1132-NEXT: s_and_not1_b32 s0, exec_lo, s38 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_b32 s1, s0, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s0, s38 +; GFX1132-NEXT: s_cbranch_scc1 .LBB16_2 ; GFX1132-NEXT: .LBB16_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm @@ -9791,11 +10310,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX9-DPP-NEXT: s_add_u32 s40, s40, s3 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0 -; GFX9-DPP-NEXT: s_mov_b32 s33, s2 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_mov_b32 s33, s2 +; GFX9-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB16_3 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -9843,8 +10363,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB16_2 +; GFX9-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[38:39] +; GFX9-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB16_2 ; GFX9-DPP-NEXT: .LBB16_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -9864,8 +10386,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB16_3 +; GFX1064-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[2:3] ; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 @@ -9912,8 +10435,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[38:39] +; GFX1064-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB16_2 ; GFX1064-DPP-NEXT: .LBB16_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -9932,9 +10457,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b32 s38, 0 +; GFX1032-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB16_3 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s2 ; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 @@ -9980,8 +10506,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s0, exec_lo, s38 +; GFX1032-DPP-NEXT: s_and_b32 s1, s0, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s0, s38 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB16_2 ; GFX1032-DPP-NEXT: .LBB16_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -9999,10 +10527,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:16 ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB16_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -10049,8 +10578,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[38:39] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1164-DPP-NEXT: s_and_not1_b64 s[0:1], exec, s[38:39] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB16_2 ; GFX1164-DPP-NEXT: .LBB16_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm @@ -10067,11 +10599,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:20 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:16 ; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:16 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1132-DPP-NEXT: s_mov_b32 s38, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB16_3 +; GFX1132-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -10112,8 +10645,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1132-DPP-NEXT: s_and_not1_b32 s0, exec_lo, s38 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_b32 s1, s0, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s0, s38 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB16_2 ; GFX1132-DPP-NEXT: .LBB16_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm @@ -10198,13 +10734,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB17_1 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], exec, s[42:43] +; GFX7LESS-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[0:1], s[42:43] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; @@ -10285,8 +10823,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-NEXT: s_cbranch_execnz .LBB17_1 +; GFX9-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX9-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX9-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; @@ -10368,8 +10908,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1064-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; @@ -10451,8 +10993,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1032-NEXT: s_andn2_b32 s0, exec_lo, s44 +; GFX1032-NEXT: s_and_b32 s1, s0, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; @@ -10523,8 +11067,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], exec, s[44:45] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm @@ -10589,8 +11136,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1132-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_b32 s1, s0, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm @@ -10672,8 +11222,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB17_1 +; GFX9-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX9-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; @@ -10755,8 +11307,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1064-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; @@ -10838,8 +11392,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1032-DPP-NEXT: s_andn2_b32 s0, exec_lo, s44 +; GFX1032-DPP-NEXT: s_and_b32 s1, s0, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; @@ -10910,8 +11466,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1164-DPP-NEXT: s_and_not1_b64 s[0:1], exec, s[44:45] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm @@ -10976,8 +11535,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1132-DPP-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_b32 s1, s0, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm @@ -10993,8 +11555,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB18_3 +; GFX7LESS-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB18_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -11017,9 +11580,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7LESS-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB18_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB18_2 ; GFX7LESS-NEXT: .LBB18_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -11029,8 +11594,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB18_3 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB18_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3] @@ -11049,9 +11615,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB18_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB18_2 ; GFX9-NEXT: .LBB18_3: ; GFX9-NEXT: s_endpgm ; @@ -11061,8 +11629,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB18_3 +; GFX1064-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB18_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -11082,8 +11651,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB18_2 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB18_2 ; GFX1064-NEXT: .LBB18_3: ; GFX1064-NEXT: s_endpgm ; @@ -11093,8 +11664,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB18_3 +; GFX1032-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB18_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 @@ -11113,20 +11685,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB18_2 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB18_2 ; GFX1032-NEXT: .LBB18_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB18_3 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB18_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -11148,9 +11724,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB18_2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB18_2 ; GFX1164-NEXT: .LBB18_3: ; GFX1164-NEXT: s_endpgm ; @@ -11159,10 +11737,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_mov_b32 s2, 0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1132-NEXT: s_mov_b32 s4, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB18_3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB18_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 @@ -11182,9 +11761,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB18_2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB18_2 ; GFX1132-NEXT: .LBB18_3: ; GFX1132-NEXT: s_endpgm ; @@ -11194,8 +11775,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB18_3 +; GFX9-DPP-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB18_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3] @@ -11214,9 +11796,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB18_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB18_2 ; GFX9-DPP-NEXT: .LBB18_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -11226,8 +11810,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB18_3 +; GFX1064-DPP-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB18_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -11247,8 +11832,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB18_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB18_2 ; GFX1064-DPP-NEXT: .LBB18_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -11258,8 +11845,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB18_3 +; GFX1032-DPP-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB18_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 @@ -11278,20 +11866,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB18_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB18_2 ; GFX1032-DPP-NEXT: .LBB18_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB18_3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB18_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -11313,9 +11905,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB18_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB18_2 ; GFX1164-DPP-NEXT: .LBB18_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -11324,10 +11918,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB18_3 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB18_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 @@ -11347,9 +11942,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB18_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB18_2 ; GFX1132-DPP-NEXT: .LBB18_3: ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory !1, !amdgpu.ignore.denormal.mode !1 @@ -11363,8 +11960,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB19_3 +; GFX7LESS-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB19_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -11387,9 +11985,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7LESS-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB19_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB19_2 ; GFX7LESS-NEXT: .LBB19_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -11399,8 +11999,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB19_3 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB19_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3] @@ -11419,9 +12020,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB19_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB19_2 ; GFX9-NEXT: .LBB19_3: ; GFX9-NEXT: s_endpgm ; @@ -11431,8 +12034,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB19_3 +; GFX1064-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB19_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -11452,8 +12056,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB19_2 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_2 ; GFX1064-NEXT: .LBB19_3: ; GFX1064-NEXT: s_endpgm ; @@ -11463,8 +12069,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB19_3 +; GFX1032-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB19_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 @@ -11483,20 +12090,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB19_2 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB19_2 ; GFX1032-NEXT: .LBB19_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB19_3 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB19_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -11518,9 +12129,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB19_2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_2 ; GFX1164-NEXT: .LBB19_3: ; GFX1164-NEXT: s_endpgm ; @@ -11529,10 +12142,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_mov_b32 s2, 0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1132-NEXT: s_mov_b32 s4, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB19_3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB19_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 @@ -11552,9 +12166,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB19_2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB19_2 ; GFX1132-NEXT: .LBB19_3: ; GFX1132-NEXT: s_endpgm ; @@ -11564,8 +12180,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB19_3 +; GFX9-DPP-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB19_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3] @@ -11584,9 +12201,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB19_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB19_2 ; GFX9-DPP-NEXT: .LBB19_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -11596,8 +12215,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB19_3 +; GFX1064-DPP-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB19_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -11617,8 +12237,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB19_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB19_2 ; GFX1064-DPP-NEXT: .LBB19_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -11628,8 +12250,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB19_3 +; GFX1032-DPP-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB19_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 @@ -11648,20 +12271,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB19_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB19_2 ; GFX1032-DPP-NEXT: .LBB19_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB19_3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB19_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -11683,9 +12310,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB19_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB19_2 ; GFX1164-DPP-NEXT: .LBB19_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -11694,10 +12323,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB19_3 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB19_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 @@ -11717,9 +12347,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB19_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB19_2 ; GFX1132-DPP-NEXT: .LBB19_3: ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory !1 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index 98c09dfaa2d5a..6ffe74552fa5b 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -21,8 +21,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB0_3 +; GFX7LESS-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -43,9 +44,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7LESS-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB0_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX7LESS-NEXT: .LBB0_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -54,8 +57,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB0_3 +; GFX9-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 @@ -72,9 +76,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB0_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX9-NEXT: .LBB0_3: ; GFX9-NEXT: s_endpgm ; @@ -83,8 +89,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB0_3 +; GFX1064-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 @@ -102,8 +109,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX1064-NEXT: .LBB0_3: ; GFX1064-NEXT: s_endpgm ; @@ -112,8 +121,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB0_3 +; GFX1032-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 @@ -130,19 +140,22 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX1032-NEXT: .LBB0_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB0_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v2, 0 @@ -161,9 +174,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX1164-NEXT: .LBB0_3: ; GFX1164-NEXT: s_endpgm ; @@ -171,10 +186,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB0_3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v2, 0 @@ -192,9 +208,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX1132-NEXT: .LBB0_3: ; GFX1132-NEXT: s_endpgm ; @@ -203,8 +221,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX9-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 @@ -221,9 +240,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX9-DPP-NEXT: .LBB0_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -232,8 +253,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX1064-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -251,8 +273,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX1064-DPP-NEXT: .LBB0_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -261,8 +285,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX1032-DPP-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -279,19 +304,22 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX1032-DPP-NEXT: .LBB0_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -310,9 +338,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX1164-DPP-NEXT: .LBB0_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -320,10 +350,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -341,9 +372,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX1132-DPP-NEXT: .LBB0_3: ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic, align 4 @@ -397,9 +430,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX7LESS-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_1 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; @@ -448,9 +483,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB1_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB1_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 @@ -467,9 +503,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB1_4 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB1_4 ; GFX9-NEXT: .LBB1_5: ; GFX9-NEXT: s_endpgm ; @@ -518,9 +556,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB1_5 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB1_5 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 @@ -538,8 +577,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB1_4 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB1_4 ; GFX1064-NEXT: .LBB1_5: ; GFX1064-NEXT: s_endpgm ; @@ -588,9 +629,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB1_5 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB1_5 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 @@ -607,8 +649,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB1_4 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB1_4 ; GFX1032-NEXT: .LBB1_5: ; GFX1032-NEXT: s_endpgm ; @@ -648,12 +692,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB1_5 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB1_5 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 @@ -672,9 +717,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB1_4 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB1_4 ; GFX1164-NEXT: .LBB1_5: ; GFX1164-NEXT: s_endpgm ; @@ -715,11 +762,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB1_5 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB1_5 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_max_f32 v2, v2, v2 @@ -736,9 +784,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB1_4 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB1_4 ; GFX1132-NEXT: .LBB1_5: ; GFX1132-NEXT: s_endpgm ; @@ -811,8 +861,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX9-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB1_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -829,9 +880,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX9-DPP-NEXT: .LBB1_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -893,18 +946,21 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 32 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 0 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_max_f32_e64 v3, s2, s2 ; GFX1064-DPP-NEXT: v_max_f32_e64 v4, s3, s3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX1064-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB1_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -922,8 +978,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX1064-DPP-NEXT: .LBB1_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -972,9 +1030,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0xff800000 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 @@ -982,14 +1040,17 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX1032-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB1_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -1006,8 +1067,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX1032-DPP-NEXT: .LBB1_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -1048,12 +1111,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0xff800000 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf @@ -1067,21 +1130,24 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v1 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX1164-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB1_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -1100,9 +1166,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX1164-DPP-NEXT: .LBB1_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -1141,10 +1209,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0xff800000 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: v_dual_max_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0xff800000 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0xff800000 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf @@ -1154,18 +1222,21 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX1132-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB1_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -1183,9 +1254,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX1132-DPP-NEXT: .LBB1_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call float @div.float.value() @@ -1199,8 +1272,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB2_3 +; GFX7LESS-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -1221,9 +1295,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7LESS-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB2_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX7LESS-NEXT: .LBB2_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -1232,8 +1308,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB2_3 +; GFX9-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 @@ -1250,9 +1327,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB2_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX9-NEXT: .LBB2_3: ; GFX9-NEXT: s_endpgm ; @@ -1261,8 +1340,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB2_3 +; GFX1064-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 @@ -1280,8 +1360,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX1064-NEXT: .LBB2_3: ; GFX1064-NEXT: s_endpgm ; @@ -1290,8 +1372,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB2_3 +; GFX1032-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 @@ -1308,19 +1391,22 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX1032-NEXT: .LBB2_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB2_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v2, 0 @@ -1339,9 +1425,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX1164-NEXT: .LBB2_3: ; GFX1164-NEXT: s_endpgm ; @@ -1349,10 +1437,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB2_3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v2, 0 @@ -1370,9 +1459,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX1132-NEXT: .LBB2_3: ; GFX1132-NEXT: s_endpgm ; @@ -1381,8 +1472,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX9-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 @@ -1399,9 +1491,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX9-DPP-NEXT: .LBB2_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -1410,8 +1504,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1064-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -1429,8 +1524,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX1064-DPP-NEXT: .LBB2_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -1439,8 +1536,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1032-DPP-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -1457,19 +1555,22 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX1032-DPP-NEXT: .LBB2_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -1488,9 +1589,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX1164-DPP-NEXT: .LBB2_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -1498,10 +1601,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -1519,9 +1623,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX1132-DPP-NEXT: .LBB2_3: ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 syncscope("one-as") monotonic @@ -1576,9 +1682,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX7LESS-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_1 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; @@ -1627,9 +1735,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB3_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB3_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 @@ -1646,9 +1755,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB3_4 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB3_4 ; GFX9-NEXT: .LBB3_5: ; GFX9-NEXT: s_endpgm ; @@ -1697,9 +1808,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB3_5 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB3_5 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 @@ -1717,8 +1829,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB3_4 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB3_4 ; GFX1064-NEXT: .LBB3_5: ; GFX1064-NEXT: s_endpgm ; @@ -1767,9 +1881,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB3_5 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB3_5 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 @@ -1786,8 +1901,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB3_4 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB3_4 ; GFX1032-NEXT: .LBB3_5: ; GFX1032-NEXT: s_endpgm ; @@ -1827,12 +1944,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB3_5 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB3_5 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 @@ -1851,9 +1969,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB3_4 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB3_4 ; GFX1164-NEXT: .LBB3_5: ; GFX1164-NEXT: s_endpgm ; @@ -1894,11 +2014,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB3_5 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB3_5 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_max_f32 v2, v2, v2 @@ -1915,9 +2036,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB3_4 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB3_4 ; GFX1132-NEXT: .LBB3_5: ; GFX1132-NEXT: s_endpgm ; @@ -1990,8 +2113,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX9-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB3_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -2008,9 +2132,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB3_2 ; GFX9-DPP-NEXT: .LBB3_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -2072,18 +2198,21 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 32 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 0 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_max_f32_e64 v3, s2, s2 ; GFX1064-DPP-NEXT: v_max_f32_e64 v4, s3, s3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1064-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB3_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -2101,8 +2230,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB3_2 ; GFX1064-DPP-NEXT: .LBB3_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -2151,9 +2282,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0xff800000 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 @@ -2161,14 +2292,17 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1032-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB3_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -2185,8 +2319,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB3_2 ; GFX1032-DPP-NEXT: .LBB3_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -2227,12 +2363,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0xff800000 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf @@ -2246,21 +2382,24 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v1 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1164-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB3_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -2279,9 +2418,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB3_2 ; GFX1164-DPP-NEXT: .LBB3_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -2320,10 +2461,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0xff800000 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: v_dual_max_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0xff800000 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0xff800000 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf @@ -2333,18 +2474,21 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1132-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB3_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -2362,9 +2506,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB3_2 ; GFX1132-DPP-NEXT: .LBB3_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call float @div.float.value() @@ -2379,8 +2525,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB4_3 +; GFX7LESS-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -2401,9 +2548,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7LESS-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB4_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX7LESS-NEXT: .LBB4_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -2412,8 +2561,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB4_3 +; GFX9-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 @@ -2430,9 +2580,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB4_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX9-NEXT: .LBB4_3: ; GFX9-NEXT: s_endpgm ; @@ -2441,8 +2593,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB4_3 +; GFX1064-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 @@ -2460,8 +2613,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1064-NEXT: .LBB4_3: ; GFX1064-NEXT: s_endpgm ; @@ -2470,8 +2625,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB4_3 +; GFX1032-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 @@ -2488,19 +2644,22 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1032-NEXT: .LBB4_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB4_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v2, 0 @@ -2519,9 +2678,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1164-NEXT: .LBB4_3: ; GFX1164-NEXT: s_endpgm ; @@ -2529,10 +2690,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB4_3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v2, 0 @@ -2550,9 +2712,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1132-NEXT: .LBB4_3: ; GFX1132-NEXT: s_endpgm ; @@ -2561,8 +2725,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX9-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 @@ -2579,9 +2744,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX9-DPP-NEXT: .LBB4_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -2590,8 +2757,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1064-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -2609,8 +2777,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1064-DPP-NEXT: .LBB4_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -2619,8 +2789,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1032-DPP-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -2637,19 +2808,22 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1032-DPP-NEXT: .LBB4_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -2668,9 +2842,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1164-DPP-NEXT: .LBB4_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -2678,10 +2854,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -2699,9 +2876,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1132-DPP-NEXT: .LBB4_3: ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 monotonic, align 4 @@ -2755,9 +2934,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX7LESS-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_1 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; @@ -2806,9 +2987,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB5_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB5_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 @@ -2825,9 +3007,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB5_4 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX9-NEXT: .LBB5_5: ; GFX9-NEXT: s_endpgm ; @@ -2876,9 +3060,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB5_5 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB5_5 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 @@ -2896,8 +3081,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB5_4 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1064-NEXT: .LBB5_5: ; GFX1064-NEXT: s_endpgm ; @@ -2946,9 +3133,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB5_5 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB5_5 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 @@ -2965,8 +3153,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB5_4 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1032-NEXT: .LBB5_5: ; GFX1032-NEXT: s_endpgm ; @@ -3006,12 +3196,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB5_5 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB5_5 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 @@ -3030,9 +3221,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB5_4 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1164-NEXT: .LBB5_5: ; GFX1164-NEXT: s_endpgm ; @@ -3073,11 +3266,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB5_5 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB5_5 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_max_f32 v2, v2, v2 @@ -3094,9 +3288,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB5_4 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1132-NEXT: .LBB5_5: ; GFX1132-NEXT: s_endpgm ; @@ -3169,8 +3365,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX9-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB5_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -3187,9 +3384,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB5_2 ; GFX9-DPP-NEXT: .LBB5_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -3251,18 +3450,21 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 32 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 0 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_max_f32_e64 v3, s2, s2 ; GFX1064-DPP-NEXT: v_max_f32_e64 v4, s3, s3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1064-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB5_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -3280,8 +3482,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB5_2 ; GFX1064-DPP-NEXT: .LBB5_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -3330,9 +3534,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0xff800000 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 @@ -3340,14 +3544,17 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1032-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB5_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -3364,8 +3571,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB5_2 ; GFX1032-DPP-NEXT: .LBB5_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -3406,12 +3615,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0xff800000 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf @@ -3425,21 +3634,24 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v1 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1164-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB5_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -3458,9 +3670,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB5_2 ; GFX1164-DPP-NEXT: .LBB5_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -3499,10 +3713,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0xff800000 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: v_dual_max_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0xff800000 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0xff800000 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf @@ -3512,18 +3726,21 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1132-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB5_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -3541,9 +3758,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB5_2 ; GFX1132-DPP-NEXT: .LBB5_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call float @div.float.value() @@ -3566,8 +3785,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB6_3 +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s33, s2 ; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9 @@ -3608,13 +3828,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[40:43], 0 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_2 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], exec, s[38:39] +; GFX7LESS-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX7LESS-NEXT: .LBB6_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -3629,11 +3851,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX9-NEXT: s_add_u32 s40, s40, s3 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: s_addc_u32 s41, s41, 0 -; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-NEXT: s_and_b64 s[0:1], vcc, -1 ; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB6_3 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 ; GFX9-NEXT: s_mov_b32 s33, s2 @@ -3677,8 +3900,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] -; GFX9-NEXT: s_cbranch_execnz .LBB6_2 +; GFX9-NEXT: s_andn2_b64 s[0:1], exec, s[38:39] +; GFX9-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX9-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX9-NEXT: .LBB6_3: ; GFX9-NEXT: s_endpgm ; @@ -3696,8 +3921,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1064-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB6_3 +; GFX1064-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 ; GFX1064-NEXT: s_mov_b32 s33, s2 @@ -3742,8 +3968,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39] -; GFX1064-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1064-NEXT: s_andn2_b64 s[0:1], exec, s[38:39] +; GFX1064-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX1064-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX1064-NEXT: .LBB6_3: ; GFX1064-NEXT: s_endpgm ; @@ -3760,9 +3988,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1032-NEXT: s_addc_u32 s41, s41, 0 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1032-NEXT: s_mov_b32 s38, 0 +; GFX1032-NEXT: s_and_b32 s0, vcc_lo, -1 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB6_3 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 ; GFX1032-NEXT: s_mov_b32 s33, s2 @@ -3806,8 +4035,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 -; GFX1032-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1032-NEXT: s_andn2_b32 s0, exec_lo, s38 +; GFX1032-NEXT: s_and_b32 s1, s0, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s0, s38 +; GFX1032-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX1032-NEXT: .LBB6_3: ; GFX1032-NEXT: s_endpgm ; @@ -3817,11 +4048,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1164-NEXT: s_mov_b32 s32, 32 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB6_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 ; GFX1164-NEXT: s_mov_b32 s33, s2 @@ -3865,8 +4097,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39] -; GFX1164-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], exec, s[38:39] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX1164-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX1164-NEXT: .LBB6_3: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm @@ -3878,9 +4113,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1132-NEXT: s_mov_b32 s38, 0 ; GFX1132-NEXT: s_mov_b32 s32, 32 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB6_3 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 ; GFX1132-NEXT: s_mov_b32 s33, s15 @@ -3919,8 +4156,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 -; GFX1132-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1132-NEXT: s_and_not1_b32 s0, exec_lo, s38 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_b32 s1, s0, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s0, s38 +; GFX1132-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX1132-NEXT: .LBB6_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm @@ -3936,11 +4176,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: s_add_u32 s40, s40, s3 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 ; GFX9-DPP-NEXT: s_mov_b32 s33, s2 @@ -3984,8 +4225,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX9-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[38:39] +; GFX9-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX9-DPP-NEXT: .LBB6_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -4003,8 +4246,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1064-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 ; GFX1064-DPP-NEXT: s_mov_b32 s33, s2 @@ -4049,8 +4293,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[38:39] +; GFX1064-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX1064-DPP-NEXT: .LBB6_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -4067,9 +4313,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b32 s38, 0 +; GFX1032-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 ; GFX1032-DPP-NEXT: s_mov_b32 s33, s2 @@ -4113,8 +4360,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s0, exec_lo, s38 +; GFX1032-DPP-NEXT: s_and_b32 s1, s0, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s0, s38 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX1032-DPP-NEXT: .LBB6_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -4124,11 +4373,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 ; GFX1164-DPP-NEXT: s_mov_b32 s33, s2 @@ -4172,8 +4422,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[38:39] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1164-DPP-NEXT: s_and_not1_b64 s[0:1], exec, s[38:39] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX1164-DPP-NEXT: .LBB6_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm @@ -4185,9 +4438,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s38, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 @@ -4226,8 +4481,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1132-DPP-NEXT: s_and_not1_b32 s0, exec_lo, s38 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_b32 s1, s0, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s0, s38 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX1132-DPP-NEXT: .LBB6_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm @@ -4312,12 +4570,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v2, off, s[48:51], 0 ; GFX7LESS-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_1 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], exec, s[42:43] +; GFX7LESS-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[0:1], s[42:43] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; @@ -4398,8 +4658,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-NEXT: s_cbranch_execnz .LBB7_1 +; GFX9-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX9-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; @@ -4481,8 +4743,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1064-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; @@ -4564,8 +4828,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1032-NEXT: s_andn2_b32 s0, exec_lo, s44 +; GFX1032-NEXT: s_and_b32 s1, s0, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; @@ -4635,8 +4901,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], exec, s[44:45] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm @@ -4703,8 +4972,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1132-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_b32 s1, s0, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm @@ -4786,8 +5058,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX9-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX9-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; @@ -4869,8 +5143,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1064-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; @@ -4952,8 +5228,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1032-DPP-NEXT: s_andn2_b32 s0, exec_lo, s44 +; GFX1032-DPP-NEXT: s_and_b32 s1, s0, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; @@ -5023,8 +5301,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1164-DPP-NEXT: s_and_not1_b64 s[0:1], exec, s[44:45] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm @@ -5091,8 +5372,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1132-DPP-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_b32 s1, s0, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm @@ -5107,8 +5391,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB8_3 +; GFX7LESS-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -5132,10 +5417,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7LESS-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v4 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v5 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB8_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX7LESS-NEXT: .LBB8_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -5144,8 +5431,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB8_3 +; GFX9-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 @@ -5164,9 +5452,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB8_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX9-NEXT: .LBB8_3: ; GFX9-NEXT: s_endpgm ; @@ -5175,8 +5465,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB8_3 +; GFX1064-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 @@ -5196,8 +5487,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1064-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX1064-NEXT: .LBB8_3: ; GFX1064-NEXT: s_endpgm ; @@ -5206,8 +5499,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB8_3 +; GFX1032-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 @@ -5226,19 +5520,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX1032-NEXT: .LBB8_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB8_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 @@ -5259,9 +5556,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1164-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX1164-NEXT: .LBB8_3: ; GFX1164-NEXT: s_endpgm ; @@ -5269,10 +5568,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB8_3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v4, 0 @@ -5290,9 +5590,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX1132-NEXT: .LBB8_3: ; GFX1132-NEXT: s_endpgm ; @@ -5301,8 +5603,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX9-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 @@ -5321,9 +5624,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX9-DPP-NEXT: .LBB8_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -5332,8 +5637,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1064-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -5353,8 +5659,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX1064-DPP-NEXT: .LBB8_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -5363,8 +5671,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1032-DPP-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -5383,19 +5692,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX1032-DPP-NEXT: .LBB8_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -5416,9 +5728,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX1164-DPP-NEXT: .LBB8_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -5426,10 +5740,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -5447,9 +5762,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX1132-DPP-NEXT: .LBB8_3: ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fmax ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") monotonic @@ -5505,10 +5822,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX7LESS-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_1 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; @@ -5554,9 +5873,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB9_1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; @@ -5603,8 +5924,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1064-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1064-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX1064-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; @@ -5651,8 +5974,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1032-NEXT: s_andn2_b32 s1, exec_lo, s0 +; GFX1032-NEXT: s_and_b32 s2, s1, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; @@ -5690,9 +6015,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1164-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX1164-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-NEXT: s_endpgm ; @@ -5728,9 +6055,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX1132-NEXT: s_and_b32 s2, s1, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-NEXT: s_endpgm ; @@ -5776,9 +6105,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-DPP-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_1 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; @@ -5825,8 +6156,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1064-DPP-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX1064-DPP-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; @@ -5873,8 +6206,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1032-DPP-NEXT: s_andn2_b32 s1, exec_lo, s0 +; GFX1032-DPP-NEXT: s_and_b32 s2, s1, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; @@ -5912,9 +6247,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX1164-DPP-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-DPP-NEXT: s_endpgm ; @@ -5950,9 +6287,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX1132-DPP-NEXT: s_and_b32 s2, s1, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() @@ -5975,8 +6314,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB10_3 +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s33, s2 ; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9 @@ -6017,13 +6357,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[40:43], 0 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_2 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], exec, s[38:39] +; GFX7LESS-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX7LESS-NEXT: .LBB10_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -6038,11 +6380,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX9-NEXT: s_add_u32 s40, s40, s3 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: s_addc_u32 s41, s41, 0 -; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-NEXT: s_and_b64 s[0:1], vcc, -1 ; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB10_3 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 ; GFX9-NEXT: s_mov_b32 s33, s2 @@ -6086,8 +6429,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] -; GFX9-NEXT: s_cbranch_execnz .LBB10_2 +; GFX9-NEXT: s_andn2_b64 s[0:1], exec, s[38:39] +; GFX9-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX9-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX9-NEXT: .LBB10_3: ; GFX9-NEXT: s_endpgm ; @@ -6105,8 +6450,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1064-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB10_3 +; GFX1064-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 ; GFX1064-NEXT: s_mov_b32 s33, s2 @@ -6151,8 +6497,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39] -; GFX1064-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1064-NEXT: s_andn2_b64 s[0:1], exec, s[38:39] +; GFX1064-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX1064-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX1064-NEXT: .LBB10_3: ; GFX1064-NEXT: s_endpgm ; @@ -6169,9 +6517,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1032-NEXT: s_addc_u32 s41, s41, 0 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1032-NEXT: s_mov_b32 s38, 0 +; GFX1032-NEXT: s_and_b32 s0, vcc_lo, -1 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB10_3 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 ; GFX1032-NEXT: s_mov_b32 s33, s2 @@ -6215,8 +6564,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 -; GFX1032-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1032-NEXT: s_andn2_b32 s0, exec_lo, s38 +; GFX1032-NEXT: s_and_b32 s1, s0, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s0, s38 +; GFX1032-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX1032-NEXT: .LBB10_3: ; GFX1032-NEXT: s_endpgm ; @@ -6226,11 +6577,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1164-NEXT: s_mov_b32 s32, 32 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB10_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 ; GFX1164-NEXT: s_mov_b32 s33, s2 @@ -6274,8 +6626,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39] -; GFX1164-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], exec, s[38:39] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX1164-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX1164-NEXT: .LBB10_3: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm @@ -6287,9 +6642,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1132-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1132-NEXT: s_mov_b32 s38, 0 ; GFX1132-NEXT: s_mov_b32 s32, 32 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB10_3 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 ; GFX1132-NEXT: s_mov_b32 s33, s15 @@ -6328,8 +6685,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 -; GFX1132-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1132-NEXT: s_and_not1_b32 s0, exec_lo, s38 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_b32 s1, s0, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s0, s38 +; GFX1132-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX1132-NEXT: .LBB10_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm @@ -6345,11 +6705,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX9-DPP-NEXT: s_add_u32 s40, s40, s3 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 ; GFX9-DPP-NEXT: s_mov_b32 s33, s2 @@ -6393,8 +6754,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX9-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[38:39] +; GFX9-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX9-DPP-NEXT: .LBB10_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -6412,8 +6775,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1064-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 ; GFX1064-DPP-NEXT: s_mov_b32 s33, s2 @@ -6458,8 +6822,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[38:39] +; GFX1064-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX1064-DPP-NEXT: .LBB10_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -6476,9 +6842,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b32 s38, 0 +; GFX1032-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 ; GFX1032-DPP-NEXT: s_mov_b32 s33, s2 @@ -6522,8 +6889,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s0, exec_lo, s38 +; GFX1032-DPP-NEXT: s_and_b32 s1, s0, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s0, s38 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX1032-DPP-NEXT: .LBB10_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -6533,11 +6902,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 ; GFX1164-DPP-NEXT: s_mov_b32 s33, s2 @@ -6581,8 +6951,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[38:39] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1164-DPP-NEXT: s_and_not1_b64 s[0:1], exec, s[38:39] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX1164-DPP-NEXT: .LBB10_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm @@ -6594,9 +6967,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s38, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 @@ -6635,8 +7010,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1132-DPP-NEXT: s_and_not1_b32 s0, exec_lo, s38 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_b32 s1, s0, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s0, s38 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX1132-DPP-NEXT: .LBB10_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm @@ -6721,12 +7099,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v2, off, s[48:51], 0 ; GFX7LESS-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_1 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], exec, s[42:43] +; GFX7LESS-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[0:1], s[42:43] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; @@ -6807,8 +7187,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-NEXT: s_cbranch_execnz .LBB11_1 +; GFX9-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX9-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX9-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; @@ -6890,8 +7272,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1064-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; @@ -6973,8 +7357,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1032-NEXT: s_andn2_b32 s0, exec_lo, s44 +; GFX1032-NEXT: s_and_b32 s1, s0, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; @@ -7044,8 +7430,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], exec, s[44:45] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm @@ -7112,8 +7501,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1132-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_b32 s1, s0, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm @@ -7195,8 +7587,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_1 +; GFX9-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX9-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; @@ -7278,8 +7672,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1064-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; @@ -7361,8 +7757,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1032-DPP-NEXT: s_andn2_b32 s0, exec_lo, s44 +; GFX1032-DPP-NEXT: s_and_b32 s1, s0, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; @@ -7432,8 +7830,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1164-DPP-NEXT: s_and_not1_b64 s[0:1], exec, s[44:45] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm @@ -7500,8 +7901,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1132-DPP-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_b32 s1, s0, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm @@ -7516,8 +7920,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB12_3 +; GFX7LESS-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB12_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -7538,9 +7943,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7LESS-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB12_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB12_2 ; GFX7LESS-NEXT: .LBB12_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -7549,8 +7956,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB12_3 +; GFX9-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB12_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 @@ -7567,9 +7975,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB12_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB12_2 ; GFX9-NEXT: .LBB12_3: ; GFX9-NEXT: s_endpgm ; @@ -7578,8 +7988,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB12_3 +; GFX1064-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB12_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 @@ -7597,8 +8008,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB12_2 ; GFX1064-NEXT: .LBB12_3: ; GFX1064-NEXT: s_endpgm ; @@ -7607,8 +8020,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB12_3 +; GFX1032-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB12_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 @@ -7625,19 +8039,22 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB12_2 ; GFX1032-NEXT: .LBB12_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB12_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB12_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v2, 0 @@ -7656,9 +8073,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB12_2 ; GFX1164-NEXT: .LBB12_3: ; GFX1164-NEXT: s_endpgm ; @@ -7666,10 +8085,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB12_3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB12_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v2, 0 @@ -7687,9 +8107,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB12_2 ; GFX1132-NEXT: .LBB12_3: ; GFX1132-NEXT: s_endpgm ; @@ -7698,8 +8120,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX9-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB12_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 @@ -7716,9 +8139,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB12_2 ; GFX9-DPP-NEXT: .LBB12_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -7727,8 +8152,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1064-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB12_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -7746,8 +8172,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB12_2 ; GFX1064-DPP-NEXT: .LBB12_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -7756,8 +8184,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1032-DPP-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB12_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -7774,19 +8203,22 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB12_2 ; GFX1032-DPP-NEXT: .LBB12_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB12_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -7805,9 +8237,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB12_2 ; GFX1164-DPP-NEXT: .LBB12_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -7815,10 +8249,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB12_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -7836,9 +8271,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB12_2 ; GFX1132-DPP-NEXT: .LBB12_3: ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory !1, !amdgpu.ignore.denormal.mode !1 @@ -7851,8 +8288,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB13_3 +; GFX7LESS-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -7873,9 +8311,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7LESS-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB13_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX7LESS-NEXT: .LBB13_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -7884,8 +8324,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB13_3 +; GFX9-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 @@ -7902,9 +8343,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB13_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX9-NEXT: .LBB13_3: ; GFX9-NEXT: s_endpgm ; @@ -7913,8 +8356,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB13_3 +; GFX1064-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 @@ -7932,8 +8376,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX1064-NEXT: .LBB13_3: ; GFX1064-NEXT: s_endpgm ; @@ -7942,8 +8388,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB13_3 +; GFX1032-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 @@ -7960,19 +8407,22 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX1032-NEXT: .LBB13_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB13_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v2, 0 @@ -7991,9 +8441,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX1164-NEXT: .LBB13_3: ; GFX1164-NEXT: s_endpgm ; @@ -8001,10 +8453,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB13_3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v2, 0 @@ -8022,9 +8475,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX1132-NEXT: .LBB13_3: ; GFX1132-NEXT: s_endpgm ; @@ -8033,8 +8488,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX9-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 @@ -8051,9 +8507,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX9-DPP-NEXT: .LBB13_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -8062,8 +8520,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1064-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -8081,8 +8540,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX1064-DPP-NEXT: .LBB13_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -8091,8 +8552,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1032-DPP-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -8109,19 +8571,22 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX1032-DPP-NEXT: .LBB13_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -8140,9 +8605,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX1164-DPP-NEXT: .LBB13_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -8150,10 +8617,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -8171,9 +8639,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX1132-DPP-NEXT: .LBB13_3: ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory !1 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index 1fb0db0e1f0d3..c06c92f1c1c57 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -21,8 +21,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB0_3 +; GFX7LESS-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -43,9 +44,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7LESS-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB0_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX7LESS-NEXT: .LBB0_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -54,8 +57,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB0_3 +; GFX9-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 @@ -72,9 +76,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB0_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX9-NEXT: .LBB0_3: ; GFX9-NEXT: s_endpgm ; @@ -83,8 +89,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB0_3 +; GFX1064-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 @@ -102,8 +109,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX1064-NEXT: .LBB0_3: ; GFX1064-NEXT: s_endpgm ; @@ -112,8 +121,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB0_3 +; GFX1032-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 @@ -130,19 +140,22 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX1032-NEXT: .LBB0_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB0_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v2, 0 @@ -161,9 +174,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX1164-NEXT: .LBB0_3: ; GFX1164-NEXT: s_endpgm ; @@ -171,10 +186,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB0_3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v2, 0 @@ -192,9 +208,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX1132-NEXT: .LBB0_3: ; GFX1132-NEXT: s_endpgm ; @@ -203,8 +221,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX9-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 @@ -221,9 +240,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX9-DPP-NEXT: .LBB0_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -232,8 +253,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX1064-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -251,8 +273,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX1064-DPP-NEXT: .LBB0_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -261,8 +285,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX1032-DPP-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -279,19 +304,22 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX1032-DPP-NEXT: .LBB0_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -310,9 +338,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX1164-DPP-NEXT: .LBB0_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -320,10 +350,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -341,9 +372,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX1132-DPP-NEXT: .LBB0_3: ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic, align 4 @@ -397,9 +430,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX7LESS-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_1 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; @@ -448,9 +483,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB1_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB1_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 @@ -467,9 +503,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB1_4 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB1_4 ; GFX9-NEXT: .LBB1_5: ; GFX9-NEXT: s_endpgm ; @@ -518,9 +556,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB1_5 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB1_5 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 @@ -538,8 +577,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB1_4 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB1_4 ; GFX1064-NEXT: .LBB1_5: ; GFX1064-NEXT: s_endpgm ; @@ -588,9 +629,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB1_5 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB1_5 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 @@ -607,8 +649,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB1_4 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB1_4 ; GFX1032-NEXT: .LBB1_5: ; GFX1032-NEXT: s_endpgm ; @@ -648,12 +692,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB1_5 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB1_5 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 @@ -672,9 +717,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB1_4 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB1_4 ; GFX1164-NEXT: .LBB1_5: ; GFX1164-NEXT: s_endpgm ; @@ -715,11 +762,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB1_5 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB1_5 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_max_f32 v2, v2, v2 @@ -736,9 +784,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB1_4 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB1_4 ; GFX1132-NEXT: .LBB1_5: ; GFX1132-NEXT: s_endpgm ; @@ -811,8 +861,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX9-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB1_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -829,9 +880,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX9-DPP-NEXT: .LBB1_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -893,18 +946,21 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 32 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 0 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_max_f32_e64 v3, s2, s2 ; GFX1064-DPP-NEXT: v_max_f32_e64 v4, s3, s3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX1064-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB1_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -922,8 +978,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX1064-DPP-NEXT: .LBB1_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -972,9 +1030,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7f800000 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7f800000 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7f800000 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 @@ -982,14 +1040,17 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX1032-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB1_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -1006,8 +1067,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX1032-DPP-NEXT: .LBB1_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -1048,12 +1111,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7f800000 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf @@ -1067,21 +1130,24 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v1 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX1164-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB1_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -1100,9 +1166,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX1164-DPP-NEXT: .LBB1_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -1141,10 +1209,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7f800000 -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: v_dual_min_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0x7f800000 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7f800000 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf @@ -1154,18 +1222,21 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX1132-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB1_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -1183,9 +1254,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX1132-DPP-NEXT: .LBB1_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call float @div.float.value() @@ -1199,8 +1272,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB2_3 +; GFX7LESS-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -1221,9 +1295,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7LESS-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB2_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX7LESS-NEXT: .LBB2_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -1232,8 +1308,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB2_3 +; GFX9-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 @@ -1250,9 +1327,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB2_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX9-NEXT: .LBB2_3: ; GFX9-NEXT: s_endpgm ; @@ -1261,8 +1340,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB2_3 +; GFX1064-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 @@ -1280,8 +1360,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX1064-NEXT: .LBB2_3: ; GFX1064-NEXT: s_endpgm ; @@ -1290,8 +1372,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB2_3 +; GFX1032-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 @@ -1308,19 +1391,22 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX1032-NEXT: .LBB2_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB2_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v2, 0 @@ -1339,9 +1425,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX1164-NEXT: .LBB2_3: ; GFX1164-NEXT: s_endpgm ; @@ -1349,10 +1437,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB2_3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v2, 0 @@ -1370,9 +1459,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX1132-NEXT: .LBB2_3: ; GFX1132-NEXT: s_endpgm ; @@ -1381,8 +1472,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX9-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 @@ -1399,9 +1491,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX9-DPP-NEXT: .LBB2_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -1410,8 +1504,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1064-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -1429,8 +1524,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX1064-DPP-NEXT: .LBB2_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -1439,8 +1536,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1032-DPP-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -1457,19 +1555,22 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX1032-DPP-NEXT: .LBB2_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -1488,9 +1589,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX1164-DPP-NEXT: .LBB2_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -1498,10 +1601,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -1519,9 +1623,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX1132-DPP-NEXT: .LBB2_3: ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 syncscope("one-as") monotonic @@ -1576,9 +1682,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX7LESS-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_1 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; @@ -1627,9 +1735,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB3_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB3_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 @@ -1646,9 +1755,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB3_4 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB3_4 ; GFX9-NEXT: .LBB3_5: ; GFX9-NEXT: s_endpgm ; @@ -1697,9 +1808,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB3_5 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB3_5 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 @@ -1717,8 +1829,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB3_4 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB3_4 ; GFX1064-NEXT: .LBB3_5: ; GFX1064-NEXT: s_endpgm ; @@ -1767,9 +1881,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB3_5 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB3_5 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 @@ -1786,8 +1901,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB3_4 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB3_4 ; GFX1032-NEXT: .LBB3_5: ; GFX1032-NEXT: s_endpgm ; @@ -1827,12 +1944,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB3_5 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB3_5 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 @@ -1851,9 +1969,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB3_4 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB3_4 ; GFX1164-NEXT: .LBB3_5: ; GFX1164-NEXT: s_endpgm ; @@ -1894,11 +2014,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB3_5 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB3_5 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_max_f32 v2, v2, v2 @@ -1915,9 +2036,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB3_4 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB3_4 ; GFX1132-NEXT: .LBB3_5: ; GFX1132-NEXT: s_endpgm ; @@ -1990,8 +2113,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX9-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB3_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -2008,9 +2132,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB3_2 ; GFX9-DPP-NEXT: .LBB3_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -2072,18 +2198,21 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 32 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 0 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_max_f32_e64 v3, s2, s2 ; GFX1064-DPP-NEXT: v_max_f32_e64 v4, s3, s3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1064-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB3_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -2101,8 +2230,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB3_2 ; GFX1064-DPP-NEXT: .LBB3_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -2151,9 +2282,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7f800000 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7f800000 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7f800000 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 @@ -2161,14 +2292,17 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1032-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB3_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -2185,8 +2319,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB3_2 ; GFX1032-DPP-NEXT: .LBB3_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -2227,12 +2363,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7f800000 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf @@ -2246,21 +2382,24 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v1 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1164-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB3_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -2279,9 +2418,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB3_2 ; GFX1164-DPP-NEXT: .LBB3_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -2320,10 +2461,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7f800000 -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: v_dual_min_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0x7f800000 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7f800000 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf @@ -2333,18 +2474,21 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1132-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB3_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -2362,9 +2506,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB3_2 ; GFX1132-DPP-NEXT: .LBB3_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call float @div.float.value() @@ -2379,8 +2525,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB4_3 +; GFX7LESS-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -2401,9 +2548,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7LESS-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB4_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX7LESS-NEXT: .LBB4_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -2412,8 +2561,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB4_3 +; GFX9-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 @@ -2430,9 +2580,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB4_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX9-NEXT: .LBB4_3: ; GFX9-NEXT: s_endpgm ; @@ -2441,8 +2593,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB4_3 +; GFX1064-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 @@ -2460,8 +2613,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1064-NEXT: .LBB4_3: ; GFX1064-NEXT: s_endpgm ; @@ -2470,8 +2625,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB4_3 +; GFX1032-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 @@ -2488,19 +2644,22 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1032-NEXT: .LBB4_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB4_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v2, 0 @@ -2519,9 +2678,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1164-NEXT: .LBB4_3: ; GFX1164-NEXT: s_endpgm ; @@ -2529,10 +2690,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB4_3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v2, 0 @@ -2550,9 +2712,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1132-NEXT: .LBB4_3: ; GFX1132-NEXT: s_endpgm ; @@ -2561,8 +2725,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX9-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 @@ -2579,9 +2744,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX9-DPP-NEXT: .LBB4_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -2590,8 +2757,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1064-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -2609,8 +2777,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1064-DPP-NEXT: .LBB4_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -2619,8 +2789,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1032-DPP-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -2637,19 +2808,22 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1032-DPP-NEXT: .LBB4_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -2668,9 +2842,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1164-DPP-NEXT: .LBB4_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -2678,10 +2854,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -2699,9 +2876,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1132-DPP-NEXT: .LBB4_3: ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 monotonic, align 4 @@ -2755,9 +2934,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX7LESS-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_1 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; @@ -2806,9 +2987,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB5_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB5_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 @@ -2825,9 +3007,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB5_4 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX9-NEXT: .LBB5_5: ; GFX9-NEXT: s_endpgm ; @@ -2876,9 +3060,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB5_5 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB5_5 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 @@ -2896,8 +3081,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB5_4 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1064-NEXT: .LBB5_5: ; GFX1064-NEXT: s_endpgm ; @@ -2946,9 +3133,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB5_5 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB5_5 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 @@ -2965,8 +3153,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB5_4 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1032-NEXT: .LBB5_5: ; GFX1032-NEXT: s_endpgm ; @@ -3006,12 +3196,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB5_5 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB5_5 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 @@ -3030,9 +3221,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB5_4 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1164-NEXT: .LBB5_5: ; GFX1164-NEXT: s_endpgm ; @@ -3073,11 +3266,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB5_5 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB5_5 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_max_f32 v2, v2, v2 @@ -3094,9 +3288,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB5_4 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1132-NEXT: .LBB5_5: ; GFX1132-NEXT: s_endpgm ; @@ -3169,8 +3365,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX9-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB5_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -3187,9 +3384,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB5_2 ; GFX9-DPP-NEXT: .LBB5_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -3251,18 +3450,21 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 32 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 0 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_max_f32_e64 v3, s2, s2 ; GFX1064-DPP-NEXT: v_max_f32_e64 v4, s3, s3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v4, v3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1064-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB5_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -3280,8 +3482,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB5_2 ; GFX1064-DPP-NEXT: .LBB5_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -3330,9 +3534,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7f800000 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7f800000 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7f800000 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 @@ -3340,14 +3544,17 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1032-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB5_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -3364,8 +3571,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB5_2 ; GFX1032-DPP-NEXT: .LBB5_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -3406,12 +3615,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7f800000 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf @@ -3425,21 +3634,24 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v1 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1164-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB5_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -3458,9 +3670,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB5_2 ; GFX1164-DPP-NEXT: .LBB5_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -3499,10 +3713,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7f800000 -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: v_dual_min_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0x7f800000 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7f800000 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf @@ -3512,18 +3726,21 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1132-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB5_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -3541,9 +3758,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB5_2 ; GFX1132-DPP-NEXT: .LBB5_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call float @div.float.value() @@ -3566,8 +3785,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB6_3 +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s33, s2 ; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9 @@ -3608,13 +3828,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[40:43], 0 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_2 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], exec, s[38:39] +; GFX7LESS-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX7LESS-NEXT: .LBB6_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -3629,11 +3851,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX9-NEXT: s_add_u32 s40, s40, s3 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: s_addc_u32 s41, s41, 0 -; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-NEXT: s_and_b64 s[0:1], vcc, -1 ; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB6_3 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 ; GFX9-NEXT: s_mov_b32 s33, s2 @@ -3677,8 +3900,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] -; GFX9-NEXT: s_cbranch_execnz .LBB6_2 +; GFX9-NEXT: s_andn2_b64 s[0:1], exec, s[38:39] +; GFX9-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX9-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX9-NEXT: .LBB6_3: ; GFX9-NEXT: s_endpgm ; @@ -3696,8 +3921,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1064-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB6_3 +; GFX1064-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 ; GFX1064-NEXT: s_mov_b32 s33, s2 @@ -3742,8 +3968,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39] -; GFX1064-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1064-NEXT: s_andn2_b64 s[0:1], exec, s[38:39] +; GFX1064-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX1064-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX1064-NEXT: .LBB6_3: ; GFX1064-NEXT: s_endpgm ; @@ -3760,9 +3988,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1032-NEXT: s_addc_u32 s41, s41, 0 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1032-NEXT: s_mov_b32 s38, 0 +; GFX1032-NEXT: s_and_b32 s0, vcc_lo, -1 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB6_3 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 ; GFX1032-NEXT: s_mov_b32 s33, s2 @@ -3806,8 +4035,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 -; GFX1032-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1032-NEXT: s_andn2_b32 s0, exec_lo, s38 +; GFX1032-NEXT: s_and_b32 s1, s0, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s0, s38 +; GFX1032-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX1032-NEXT: .LBB6_3: ; GFX1032-NEXT: s_endpgm ; @@ -3817,11 +4048,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1164-NEXT: s_mov_b32 s32, 32 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB6_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 ; GFX1164-NEXT: s_mov_b32 s33, s2 @@ -3865,8 +4097,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39] -; GFX1164-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], exec, s[38:39] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX1164-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX1164-NEXT: .LBB6_3: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm @@ -3878,9 +4113,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1132-NEXT: s_mov_b32 s38, 0 ; GFX1132-NEXT: s_mov_b32 s32, 32 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB6_3 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 ; GFX1132-NEXT: s_mov_b32 s33, s15 @@ -3919,8 +4156,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 -; GFX1132-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1132-NEXT: s_and_not1_b32 s0, exec_lo, s38 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_b32 s1, s0, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s0, s38 +; GFX1132-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX1132-NEXT: .LBB6_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm @@ -3936,11 +4176,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: s_add_u32 s40, s40, s3 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 ; GFX9-DPP-NEXT: s_mov_b32 s33, s2 @@ -3984,8 +4225,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX9-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[38:39] +; GFX9-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX9-DPP-NEXT: .LBB6_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -4003,8 +4246,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1064-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 ; GFX1064-DPP-NEXT: s_mov_b32 s33, s2 @@ -4049,8 +4293,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[38:39] +; GFX1064-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX1064-DPP-NEXT: .LBB6_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -4067,9 +4313,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b32 s38, 0 +; GFX1032-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 ; GFX1032-DPP-NEXT: s_mov_b32 s33, s2 @@ -4113,8 +4360,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s0, exec_lo, s38 +; GFX1032-DPP-NEXT: s_and_b32 s1, s0, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s0, s38 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX1032-DPP-NEXT: .LBB6_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -4124,11 +4373,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 ; GFX1164-DPP-NEXT: s_mov_b32 s33, s2 @@ -4172,8 +4422,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[38:39] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1164-DPP-NEXT: s_and_not1_b64 s[0:1], exec, s[38:39] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX1164-DPP-NEXT: .LBB6_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm @@ -4185,9 +4438,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s38, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 @@ -4226,8 +4481,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1132-DPP-NEXT: s_and_not1_b32 s0, exec_lo, s38 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_b32 s1, s0, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s0, s38 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX1132-DPP-NEXT: .LBB6_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm @@ -4312,12 +4570,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v2, off, s[48:51], 0 ; GFX7LESS-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_1 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], exec, s[42:43] +; GFX7LESS-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[0:1], s[42:43] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; @@ -4398,8 +4658,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-NEXT: s_cbranch_execnz .LBB7_1 +; GFX9-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX9-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; @@ -4481,8 +4743,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1064-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; @@ -4564,8 +4828,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1032-NEXT: s_andn2_b32 s0, exec_lo, s44 +; GFX1032-NEXT: s_and_b32 s1, s0, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; @@ -4635,8 +4901,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], exec, s[44:45] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm @@ -4703,8 +4972,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1132-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_b32 s1, s0, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm @@ -4786,8 +5058,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX9-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX9-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; @@ -4869,8 +5143,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1064-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; @@ -4952,8 +5228,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1032-DPP-NEXT: s_andn2_b32 s0, exec_lo, s44 +; GFX1032-DPP-NEXT: s_and_b32 s1, s0, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; @@ -5023,8 +5301,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1164-DPP-NEXT: s_and_not1_b64 s[0:1], exec, s[44:45] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm @@ -5091,8 +5372,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1132-DPP-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_b32 s1, s0, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm @@ -5107,8 +5391,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB8_3 +; GFX7LESS-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -5132,10 +5417,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7LESS-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v4 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v5 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB8_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX7LESS-NEXT: .LBB8_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -5144,8 +5431,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB8_3 +; GFX9-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 @@ -5164,9 +5452,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB8_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX9-NEXT: .LBB8_3: ; GFX9-NEXT: s_endpgm ; @@ -5175,8 +5465,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB8_3 +; GFX1064-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 @@ -5196,8 +5487,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1064-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX1064-NEXT: .LBB8_3: ; GFX1064-NEXT: s_endpgm ; @@ -5206,8 +5499,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB8_3 +; GFX1032-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 @@ -5226,19 +5520,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX1032-NEXT: .LBB8_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB8_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 @@ -5259,9 +5556,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1164-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX1164-NEXT: .LBB8_3: ; GFX1164-NEXT: s_endpgm ; @@ -5269,10 +5568,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB8_3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v4, 0 @@ -5290,9 +5590,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX1132-NEXT: .LBB8_3: ; GFX1132-NEXT: s_endpgm ; @@ -5301,8 +5603,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX9-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 @@ -5321,9 +5624,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX9-DPP-NEXT: .LBB8_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -5332,8 +5637,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1064-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -5353,8 +5659,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX1064-DPP-NEXT: .LBB8_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -5363,8 +5671,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1032-DPP-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -5383,19 +5692,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX1032-DPP-NEXT: .LBB8_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -5416,9 +5728,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX1164-DPP-NEXT: .LBB8_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -5426,10 +5740,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -5447,9 +5762,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX1132-DPP-NEXT: .LBB8_3: ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fmin ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") monotonic @@ -5505,10 +5822,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX7LESS-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_1 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; @@ -5554,9 +5873,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB9_1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; @@ -5603,8 +5924,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1064-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1064-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX1064-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; @@ -5651,8 +5974,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1032-NEXT: s_andn2_b32 s1, exec_lo, s0 +; GFX1032-NEXT: s_and_b32 s2, s1, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; @@ -5690,9 +6015,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1164-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX1164-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-NEXT: s_endpgm ; @@ -5728,9 +6055,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX1132-NEXT: s_and_b32 s2, s1, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-NEXT: s_endpgm ; @@ -5776,9 +6105,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-DPP-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_1 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; @@ -5825,8 +6156,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1064-DPP-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX1064-DPP-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; @@ -5873,8 +6206,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1032-DPP-NEXT: s_andn2_b32 s1, exec_lo, s0 +; GFX1032-DPP-NEXT: s_and_b32 s2, s1, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; @@ -5912,9 +6247,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX1164-DPP-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-DPP-NEXT: s_endpgm ; @@ -5950,9 +6287,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX1132-DPP-NEXT: s_and_b32 s2, s1, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() @@ -5975,8 +6314,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB10_3 +; GFX7LESS-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s33, s2 ; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9 @@ -6017,13 +6357,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[40:43], 0 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_2 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], exec, s[38:39] +; GFX7LESS-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX7LESS-NEXT: .LBB10_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -6038,11 +6380,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX9-NEXT: s_add_u32 s40, s40, s3 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: s_addc_u32 s41, s41, 0 -; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-NEXT: s_and_b64 s[0:1], vcc, -1 ; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz .LBB10_3 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 ; GFX9-NEXT: s_mov_b32 s33, s2 @@ -6086,8 +6429,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] -; GFX9-NEXT: s_cbranch_execnz .LBB10_2 +; GFX9-NEXT: s_andn2_b64 s[0:1], exec, s[38:39] +; GFX9-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX9-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX9-NEXT: .LBB10_3: ; GFX9-NEXT: s_endpgm ; @@ -6105,8 +6450,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1064-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB10_3 +; GFX1064-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 ; GFX1064-NEXT: s_mov_b32 s33, s2 @@ -6151,8 +6497,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39] -; GFX1064-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1064-NEXT: s_andn2_b64 s[0:1], exec, s[38:39] +; GFX1064-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX1064-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX1064-NEXT: .LBB10_3: ; GFX1064-NEXT: s_endpgm ; @@ -6169,9 +6517,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1032-NEXT: s_addc_u32 s41, s41, 0 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1032-NEXT: s_mov_b32 s38, 0 +; GFX1032-NEXT: s_and_b32 s0, vcc_lo, -1 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB10_3 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 ; GFX1032-NEXT: s_mov_b32 s33, s2 @@ -6215,8 +6564,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 -; GFX1032-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1032-NEXT: s_andn2_b32 s0, exec_lo, s38 +; GFX1032-NEXT: s_and_b32 s1, s0, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s0, s38 +; GFX1032-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX1032-NEXT: .LBB10_3: ; GFX1032-NEXT: s_endpgm ; @@ -6226,11 +6577,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1164-NEXT: s_mov_b32 s32, 32 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB10_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 ; GFX1164-NEXT: s_mov_b32 s33, s2 @@ -6274,8 +6626,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39] -; GFX1164-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], exec, s[38:39] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX1164-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX1164-NEXT: .LBB10_3: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm @@ -6287,9 +6642,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1132-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1132-NEXT: s_mov_b32 s38, 0 ; GFX1132-NEXT: s_mov_b32 s32, 32 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB10_3 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 ; GFX1132-NEXT: s_mov_b32 s33, s15 @@ -6328,8 +6685,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 -; GFX1132-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1132-NEXT: s_and_not1_b32 s0, exec_lo, s38 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_b32 s1, s0, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s0, s38 +; GFX1132-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX1132-NEXT: .LBB10_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm @@ -6345,11 +6705,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX9-DPP-NEXT: s_add_u32 s40, s40, s3 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 ; GFX9-DPP-NEXT: s_mov_b32 s33, s2 @@ -6393,8 +6754,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX9-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[38:39] +; GFX9-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX9-DPP-NEXT: .LBB10_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -6412,8 +6775,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1064-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 ; GFX1064-DPP-NEXT: s_mov_b32 s33, s2 @@ -6458,8 +6822,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[38:39] +; GFX1064-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX1064-DPP-NEXT: .LBB10_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -6476,9 +6842,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b32 s38, 0 +; GFX1032-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 ; GFX1032-DPP-NEXT: s_mov_b32 s33, s2 @@ -6522,8 +6889,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s0, exec_lo, s38 +; GFX1032-DPP-NEXT: s_and_b32 s1, s0, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s0, s38 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX1032-DPP-NEXT: .LBB10_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -6533,11 +6902,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 ; GFX1164-DPP-NEXT: s_mov_b32 s33, s2 @@ -6581,8 +6951,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[38:39] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1164-DPP-NEXT: s_and_not1_b64 s[0:1], exec, s[38:39] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX1164-DPP-NEXT: .LBB10_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm @@ -6594,9 +6967,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s38, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB10_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 @@ -6635,8 +7010,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1132-DPP-NEXT: s_and_not1_b32 s0, exec_lo, s38 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_b32 s1, s0, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s0, s38 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB10_2 ; GFX1132-DPP-NEXT: .LBB10_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm @@ -6721,12 +7099,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v2, off, s[48:51], 0 ; GFX7LESS-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_1 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], exec, s[42:43] +; GFX7LESS-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[0:1], s[42:43] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; @@ -6807,8 +7187,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-NEXT: s_cbranch_execnz .LBB11_1 +; GFX9-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX9-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX9-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; @@ -6890,8 +7272,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1064-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; @@ -6973,8 +7357,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1032-NEXT: s_andn2_b32 s0, exec_lo, s44 +; GFX1032-NEXT: s_and_b32 s1, s0, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; @@ -7044,8 +7430,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], exec, s[44:45] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm @@ -7112,8 +7501,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1132-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_b32 s1, s0, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm @@ -7195,8 +7587,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_1 +; GFX9-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX9-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; @@ -7278,8 +7672,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1064-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; @@ -7361,8 +7757,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1032-DPP-NEXT: s_andn2_b32 s0, exec_lo, s44 +; GFX1032-DPP-NEXT: s_and_b32 s1, s0, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; @@ -7432,8 +7830,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1164-DPP-NEXT: s_and_not1_b64 s[0:1], exec, s[44:45] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm @@ -7500,8 +7901,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1132-DPP-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_b32 s1, s0, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm @@ -7516,8 +7920,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB12_3 +; GFX7LESS-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB12_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -7538,9 +7943,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7LESS-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB12_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB12_2 ; GFX7LESS-NEXT: .LBB12_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -7549,8 +7956,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB12_3 +; GFX9-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB12_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 @@ -7567,9 +7975,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB12_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB12_2 ; GFX9-NEXT: .LBB12_3: ; GFX9-NEXT: s_endpgm ; @@ -7578,8 +7988,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB12_3 +; GFX1064-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB12_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 @@ -7597,8 +8008,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB12_2 ; GFX1064-NEXT: .LBB12_3: ; GFX1064-NEXT: s_endpgm ; @@ -7607,8 +8020,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB12_3 +; GFX1032-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB12_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 @@ -7625,19 +8039,22 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB12_2 ; GFX1032-NEXT: .LBB12_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB12_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB12_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v2, 0 @@ -7656,9 +8073,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB12_2 ; GFX1164-NEXT: .LBB12_3: ; GFX1164-NEXT: s_endpgm ; @@ -7666,10 +8085,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB12_3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB12_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v2, 0 @@ -7687,9 +8107,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB12_2 ; GFX1132-NEXT: .LBB12_3: ; GFX1132-NEXT: s_endpgm ; @@ -7698,8 +8120,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX9-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB12_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 @@ -7716,9 +8139,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB12_2 ; GFX9-DPP-NEXT: .LBB12_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -7727,8 +8152,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1064-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB12_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -7746,8 +8172,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB12_2 ; GFX1064-DPP-NEXT: .LBB12_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -7756,8 +8184,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1032-DPP-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB12_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -7774,19 +8203,22 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB12_2 ; GFX1032-DPP-NEXT: .LBB12_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB12_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -7805,9 +8237,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB12_2 ; GFX1164-DPP-NEXT: .LBB12_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -7815,10 +8249,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB12_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -7836,9 +8271,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB12_2 ; GFX1132-DPP-NEXT: .LBB12_3: ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory !1, !amdgpu.ignore.denormal.mode !1 @@ -7851,8 +8288,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB13_3 +; GFX7LESS-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -7873,9 +8311,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7LESS-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB13_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX7LESS-NEXT: .LBB13_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -7884,8 +8324,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB13_3 +; GFX9-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 @@ -7902,9 +8343,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB13_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX9-NEXT: .LBB13_3: ; GFX9-NEXT: s_endpgm ; @@ -7913,8 +8356,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB13_3 +; GFX1064-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 @@ -7932,8 +8376,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX1064-NEXT: .LBB13_3: ; GFX1064-NEXT: s_endpgm ; @@ -7942,8 +8388,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB13_3 +; GFX1032-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 @@ -7960,19 +8407,22 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX1032-NEXT: .LBB13_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB13_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v2, 0 @@ -7991,9 +8441,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX1164-NEXT: .LBB13_3: ; GFX1164-NEXT: s_endpgm ; @@ -8001,10 +8453,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB13_3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v2, 0 @@ -8022,9 +8475,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX1132-NEXT: .LBB13_3: ; GFX1132-NEXT: s_endpgm ; @@ -8033,8 +8488,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX9-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 @@ -8051,9 +8507,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX9-DPP-NEXT: .LBB13_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -8062,8 +8520,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1064-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -8081,8 +8540,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX1064-DPP-NEXT: .LBB13_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -8091,8 +8552,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1032-DPP-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -8109,19 +8571,22 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX1032-DPP-NEXT: .LBB13_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -8140,9 +8605,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX1164-DPP-NEXT: .LBB13_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -8150,10 +8617,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -8171,9 +8639,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX1132-DPP-NEXT: .LBB13_3: ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory !1 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll index c5f7980d1e3a9..6bcc36c19b491 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll @@ -22,8 +22,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB0_3 +; GFX7LESS-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -46,9 +47,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7LESS-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB0_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX7LESS-NEXT: .LBB0_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -58,8 +61,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB0_3 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3] @@ -78,9 +82,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB0_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX9-NEXT: .LBB0_3: ; GFX9-NEXT: s_endpgm ; @@ -90,8 +96,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB0_3 +; GFX1064-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -111,8 +118,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX1064-NEXT: .LBB0_3: ; GFX1064-NEXT: s_endpgm ; @@ -122,8 +131,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB0_3 +; GFX1032-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 @@ -142,20 +152,24 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX1032-NEXT: .LBB0_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB0_3 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -177,9 +191,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX1164-NEXT: .LBB0_3: ; GFX1164-NEXT: s_endpgm ; @@ -188,10 +204,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_mov_b32 s2, 0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1132-NEXT: s_mov_b32 s4, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB0_3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 @@ -211,9 +228,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX1132-NEXT: .LBB0_3: ; GFX1132-NEXT: s_endpgm ; @@ -223,8 +242,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX9-DPP-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3] @@ -243,9 +263,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX9-DPP-NEXT: .LBB0_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -255,8 +277,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX1064-DPP-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -276,8 +299,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX1064-DPP-NEXT: .LBB0_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -287,8 +312,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX1032-DPP-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 @@ -307,20 +333,24 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX1032-DPP-NEXT: .LBB0_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -342,9 +372,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX1164-DPP-NEXT: .LBB0_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -353,10 +385,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB0_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 @@ -376,9 +409,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX1132-DPP-NEXT: .LBB0_3: ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fsub ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic, align 4 @@ -430,9 +465,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX7LESS-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_1 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; @@ -479,9 +516,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB1_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB1_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 @@ -496,9 +534,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB1_4 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB1_4 ; GFX9-NEXT: .LBB1_5: ; GFX9-NEXT: s_endpgm ; @@ -545,9 +585,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB1_5 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB1_5 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 @@ -563,8 +604,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB1_4 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB1_4 ; GFX1064-NEXT: .LBB1_5: ; GFX1064-NEXT: s_endpgm ; @@ -611,9 +654,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB1_5 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB1_5 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 @@ -628,8 +672,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB1_4 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB1_4 ; GFX1032-NEXT: .LBB1_5: ; GFX1032-NEXT: s_endpgm ; @@ -667,12 +713,13 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB1_5 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB1_5 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 @@ -688,9 +735,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB1_4 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB1_4 ; GFX1164-NEXT: .LBB1_5: ; GFX1164-NEXT: s_endpgm ; @@ -729,11 +778,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB1_5 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB1_5 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 @@ -748,9 +798,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB1_4 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB1_4 ; GFX1132-NEXT: .LBB1_5: ; GFX1132-NEXT: s_endpgm ; @@ -816,8 +868,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX9-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB1_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -832,9 +885,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[2:3] +; GFX9-DPP-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX9-DPP-NEXT: .LBB1_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -888,18 +943,21 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_add_f32_e64 v3, s2, s3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX1064-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB1_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -915,8 +973,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX1064-DPP-NEXT: .LBB1_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -969,14 +1029,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX1032-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB1_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -991,8 +1054,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX1032-DPP-NEXT: .LBB1_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -1042,21 +1107,24 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX1164-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB1_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -1072,9 +1140,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX1164-DPP-NEXT: .LBB1_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -1122,16 +1192,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX1132-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB1_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -1146,9 +1219,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX1132-DPP-NEXT: .LBB1_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call float @div.float.value() @@ -1159,18 +1234,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 { ; GFX7LESS-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s10, -1 -; GFX7LESS-NEXT: s_mov_b32 s11, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s8, s8, s3 -; GFX7LESS-NEXT: s_addc_u32 s9, s9, 0 +; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s12, s12, s3 +; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 ; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB2_3 +; GFX7LESS-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 @@ -1197,9 +1273,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7LESS-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB2_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX7LESS-NEXT: .LBB2_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -1215,8 +1293,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB2_3 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -1239,9 +1318,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB2_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX9-NEXT: .LBB2_3: ; GFX9-NEXT: s_endpgm ; @@ -1257,8 +1338,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB2_3 +; GFX1064-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 @@ -1280,8 +1362,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX1064-NEXT: .LBB2_3: ; GFX1064-NEXT: s_endpgm ; @@ -1297,8 +1381,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB2_3 +; GFX1032-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 ; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 @@ -1319,8 +1404,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX1032-NEXT: .LBB2_3: ; GFX1032-NEXT: s_endpgm ; @@ -1330,15 +1417,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off ; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-NEXT: s_cbranch_execz .LBB2_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -1361,9 +1449,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX1164-NEXT: .LBB2_3: ; GFX1164-NEXT: s_endpgm ; @@ -1374,13 +1464,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off ; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-NEXT: s_cbranch_execz .LBB2_3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -1401,9 +1492,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX1132-NEXT: .LBB2_3: ; GFX1132-NEXT: s_endpgm ; @@ -1419,8 +1512,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX9-DPP-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -1443,9 +1537,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX9-DPP-NEXT: .LBB2_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -1461,8 +1557,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1064-DPP-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 @@ -1484,8 +1581,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX1064-DPP-NEXT: .LBB2_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -1501,8 +1600,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1032-DPP-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 ; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 @@ -1523,8 +1623,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX1032-DPP-NEXT: .LBB2_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -1534,15 +1636,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off ; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -1565,9 +1668,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX1164-DPP-NEXT: .LBB2_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -1578,13 +1683,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off ; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-DPP-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -1605,9 +1711,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX1132-DPP-NEXT: .LBB2_3: ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fsub ptr addrspace(1) %ptr, float 4.0 syncscope("one-as") monotonic @@ -1660,9 +1768,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX7LESS-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_1 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; @@ -1709,9 +1819,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB3_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB3_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 @@ -1726,9 +1837,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB3_4 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB3_4 ; GFX9-NEXT: .LBB3_5: ; GFX9-NEXT: s_endpgm ; @@ -1775,9 +1888,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB3_5 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB3_5 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 @@ -1793,8 +1907,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB3_4 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB3_4 ; GFX1064-NEXT: .LBB3_5: ; GFX1064-NEXT: s_endpgm ; @@ -1841,9 +1957,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB3_5 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB3_5 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 @@ -1858,8 +1975,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB3_4 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB3_4 ; GFX1032-NEXT: .LBB3_5: ; GFX1032-NEXT: s_endpgm ; @@ -1897,12 +2016,13 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB3_5 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB3_5 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 @@ -1918,9 +2038,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB3_4 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB3_4 ; GFX1164-NEXT: .LBB3_5: ; GFX1164-NEXT: s_endpgm ; @@ -1959,11 +2081,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB3_5 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB3_5 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 @@ -1978,9 +2101,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB3_4 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB3_4 ; GFX1132-NEXT: .LBB3_5: ; GFX1132-NEXT: s_endpgm ; @@ -2046,8 +2171,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX9-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB3_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -2062,9 +2188,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[2:3] +; GFX9-DPP-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB3_2 ; GFX9-DPP-NEXT: .LBB3_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -2118,18 +2246,21 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_add_f32_e64 v3, s2, s3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX1064-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB3_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -2145,8 +2276,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB3_2 ; GFX1064-DPP-NEXT: .LBB3_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -2199,14 +2332,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1032-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB3_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -2221,8 +2357,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB3_2 ; GFX1032-DPP-NEXT: .LBB3_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -2272,21 +2410,24 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1164-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB3_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -2302,9 +2443,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB3_2 ; GFX1164-DPP-NEXT: .LBB3_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -2352,16 +2495,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1132-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB3_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -2376,9 +2522,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB3_2 ; GFX1132-DPP-NEXT: .LBB3_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call float @div.float.value() strictfp @@ -2389,18 +2537,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp(ptr addrspace(1) %ptr) #2{ ; GFX7LESS-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s10, -1 -; GFX7LESS-NEXT: s_mov_b32 s11, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s8, s8, s3 -; GFX7LESS-NEXT: s_addc_u32 s9, s9, 0 +; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s12, s12, s3 +; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 ; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB4_3 +; GFX7LESS-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 @@ -2427,9 +2576,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7LESS-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB4_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX7LESS-NEXT: .LBB4_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -2445,8 +2596,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB4_3 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -2469,9 +2621,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB4_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX9-NEXT: .LBB4_3: ; GFX9-NEXT: s_endpgm ; @@ -2487,8 +2641,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB4_3 +; GFX1064-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 @@ -2510,8 +2665,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1064-NEXT: .LBB4_3: ; GFX1064-NEXT: s_endpgm ; @@ -2527,8 +2684,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB4_3 +; GFX1032-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 ; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 @@ -2549,8 +2707,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1032-NEXT: .LBB4_3: ; GFX1032-NEXT: s_endpgm ; @@ -2560,15 +2720,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off ; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-NEXT: s_cbranch_execz .LBB4_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -2591,9 +2752,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1164-NEXT: .LBB4_3: ; GFX1164-NEXT: s_endpgm ; @@ -2604,13 +2767,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off ; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-NEXT: s_cbranch_execz .LBB4_3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -2631,9 +2795,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1132-NEXT: .LBB4_3: ; GFX1132-NEXT: s_endpgm ; @@ -2649,8 +2815,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX9-DPP-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -2673,9 +2840,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX9-DPP-NEXT: .LBB4_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -2691,8 +2860,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1064-DPP-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 @@ -2714,8 +2884,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1064-DPP-NEXT: .LBB4_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -2731,8 +2903,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1032-DPP-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 ; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 @@ -2753,8 +2926,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1032-DPP-NEXT: .LBB4_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -2764,15 +2939,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off ; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -2795,9 +2971,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1164-DPP-NEXT: .LBB4_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -2808,13 +2986,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off ; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-DPP-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -2835,9 +3014,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB4_2 ; GFX1132-DPP-NEXT: .LBB4_3: ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fsub ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic @@ -2890,9 +3071,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX7LESS-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_1 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; @@ -2939,9 +3122,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB5_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB5_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 @@ -2956,9 +3140,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB5_4 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX9-NEXT: .LBB5_5: ; GFX9-NEXT: s_endpgm ; @@ -3005,9 +3191,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB5_5 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB5_5 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 @@ -3023,8 +3210,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB5_4 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1064-NEXT: .LBB5_5: ; GFX1064-NEXT: s_endpgm ; @@ -3071,9 +3260,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB5_5 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB5_5 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 @@ -3088,8 +3278,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB5_4 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1032-NEXT: .LBB5_5: ; GFX1032-NEXT: s_endpgm ; @@ -3127,12 +3319,13 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB5_5 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB5_5 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 @@ -3148,9 +3341,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB5_4 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1164-NEXT: .LBB5_5: ; GFX1164-NEXT: s_endpgm ; @@ -3189,11 +3384,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB5_5 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB5_5 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 @@ -3208,9 +3404,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB5_4 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1132-NEXT: .LBB5_5: ; GFX1132-NEXT: s_endpgm ; @@ -3276,8 +3474,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX9-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB5_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -3292,9 +3491,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[2:3] +; GFX9-DPP-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB5_2 ; GFX9-DPP-NEXT: .LBB5_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -3348,18 +3549,21 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_add_f32_e64 v3, s2, s3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX1064-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB5_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -3375,8 +3579,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB5_2 ; GFX1064-DPP-NEXT: .LBB5_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -3429,14 +3635,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1032-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB5_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -3451,8 +3660,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB5_2 ; GFX1032-DPP-NEXT: .LBB5_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -3502,21 +3713,24 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1164-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB5_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -3532,9 +3746,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB5_2 ; GFX1164-DPP-NEXT: .LBB5_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -3582,16 +3798,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1132-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB5_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -3606,9 +3825,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB5_2 ; GFX1132-DPP-NEXT: .LBB5_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call float @div.float.value() @@ -3662,9 +3883,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX7LESS-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_1 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; @@ -3711,9 +3934,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB6_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB6_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 @@ -3728,9 +3952,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB6_4 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB6_4 ; GFX9-NEXT: .LBB6_5: ; GFX9-NEXT: s_endpgm ; @@ -3777,9 +4003,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB6_5 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB6_5 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 @@ -3795,8 +4022,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB6_4 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB6_4 ; GFX1064-NEXT: .LBB6_5: ; GFX1064-NEXT: s_endpgm ; @@ -3843,9 +4072,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB6_5 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB6_5 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 @@ -3860,8 +4090,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB6_4 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB6_4 ; GFX1032-NEXT: .LBB6_5: ; GFX1032-NEXT: s_endpgm ; @@ -3899,12 +4131,13 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB6_5 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB6_5 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 @@ -3920,9 +4153,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB6_4 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB6_4 ; GFX1164-NEXT: .LBB6_5: ; GFX1164-NEXT: s_endpgm ; @@ -3961,11 +4196,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB6_5 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB6_5 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 @@ -3980,9 +4216,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB6_4 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB6_4 ; GFX1132-NEXT: .LBB6_5: ; GFX1132-NEXT: s_endpgm ; @@ -4048,8 +4286,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX9-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -4064,9 +4303,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[2:3] +; GFX9-DPP-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX9-DPP-NEXT: .LBB6_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -4120,18 +4361,21 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_add_f32_e64 v3, s2, s3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX1064-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -4147,8 +4391,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX1064-DPP-NEXT: .LBB6_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -4201,14 +4447,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1032-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -4223,8 +4472,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX1032-DPP-NEXT: .LBB6_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -4274,21 +4525,24 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1164-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -4304,9 +4558,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX1164-DPP-NEXT: .LBB6_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -4354,16 +4610,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1132-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -4378,9 +4637,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX1132-DPP-NEXT: .LBB6_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call float @div.float.value() strictfp @@ -4391,18 +4652,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scope_strictfp(ptr addrspace(1) %ptr) #2 { ; GFX7LESS-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s10, -1 -; GFX7LESS-NEXT: s_mov_b32 s11, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s8, s8, s3 -; GFX7LESS-NEXT: s_addc_u32 s9, s9, 0 +; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s12, s12, s3 +; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 ; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB7_3 +; GFX7LESS-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 @@ -4429,9 +4691,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7LESS-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX7LESS-NEXT: .LBB7_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -4447,8 +4711,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB7_3 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -4471,9 +4736,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB7_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX9-NEXT: .LBB7_3: ; GFX9-NEXT: s_endpgm ; @@ -4489,8 +4756,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB7_3 +; GFX1064-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 @@ -4512,8 +4780,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX1064-NEXT: .LBB7_3: ; GFX1064-NEXT: s_endpgm ; @@ -4529,8 +4799,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB7_3 +; GFX1032-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 ; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 @@ -4551,8 +4822,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX1032-NEXT: .LBB7_3: ; GFX1032-NEXT: s_endpgm ; @@ -4562,15 +4835,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off ; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-NEXT: s_cbranch_execz .LBB7_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -4593,9 +4867,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX1164-NEXT: .LBB7_3: ; GFX1164-NEXT: s_endpgm ; @@ -4606,13 +4882,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off ; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-NEXT: s_cbranch_execz .LBB7_3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -4633,9 +4910,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX1132-NEXT: .LBB7_3: ; GFX1132-NEXT: s_endpgm ; @@ -4651,8 +4930,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX9-DPP-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -4675,9 +4955,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX9-DPP-NEXT: .LBB7_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -4693,8 +4975,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1064-DPP-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 @@ -4716,8 +4999,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX1064-DPP-NEXT: .LBB7_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -4733,8 +5018,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1032-DPP-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 ; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 @@ -4755,8 +5041,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX1032-DPP-NEXT: .LBB7_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -4766,15 +5054,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off ; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -4797,9 +5086,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX1164-DPP-NEXT: .LBB7_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -4810,13 +5101,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off ; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-DPP-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -4837,9 +5129,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX1132-DPP-NEXT: .LBB7_3: ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fsub ptr addrspace(1) %ptr, float 4.0 monotonic, align 4 @@ -4891,9 +5185,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX7LESS-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; @@ -4940,9 +5236,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-NEXT: s_cbranch_execz .LBB8_5 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB8_5 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 @@ -4957,9 +5254,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB8_4 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX9-NEXT: .LBB8_5: ; GFX9-NEXT: s_endpgm ; @@ -5006,9 +5305,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB8_5 +; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1064-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB8_5 ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 @@ -5024,8 +5324,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB8_4 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1064-NEXT: .LBB8_5: ; GFX1064-NEXT: s_endpgm ; @@ -5072,9 +5374,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB8_5 +; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1032-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB8_5 ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 @@ -5089,8 +5392,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB8_4 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1032-NEXT: .LBB8_5: ; GFX1032-NEXT: s_endpgm ; @@ -5128,12 +5433,13 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execz .LBB8_5 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX1164-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB8_5 ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 @@ -5149,9 +5455,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB8_4 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1164-NEXT: .LBB8_5: ; GFX1164-NEXT: s_endpgm ; @@ -5190,11 +5498,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execz .LBB8_5 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX1132-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB8_5 ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 @@ -5209,9 +5518,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB8_4 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB8_4 ; GFX1132-NEXT: .LBB8_5: ; GFX1132-NEXT: s_endpgm ; @@ -5277,8 +5588,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX9-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -5293,9 +5605,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[6:7], exec, s[2:3] +; GFX9-DPP-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[6:7], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX9-DPP-NEXT: .LBB8_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -5349,18 +5663,21 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_add_f32_e64 v3, s2, s3 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3 +; GFX1064-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -5376,8 +5693,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX1064-DPP-NEXT: .LBB8_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -5430,14 +5749,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 -; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v3 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1032-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -5452,8 +5774,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX1032-DPP-NEXT: .LBB8_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -5503,21 +5827,24 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1164-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -5533,9 +5860,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX1164-DPP-NEXT: .LBB8_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -5583,16 +5912,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1132-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB8_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 @@ -5607,9 +5939,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX1132-DPP-NEXT: .LBB8_3: ; GFX1132-DPP-NEXT: s_endpgm %divValue = call float @div.float.value() strictfp @@ -5634,8 +5968,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB9_3 +; GFX7LESS-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -5677,13 +6012,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[40:43], 0 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_2 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], exec, s[38:39] +; GFX7LESS-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX7LESS-NEXT: .LBB9_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -5700,11 +6037,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX9-NEXT: s_add_u32 s40, s40, s3 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-NEXT: s_addc_u32 s41, s41, 0 -; GFX9-NEXT: s_mov_b32 s33, s2 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b32 s33, s2 +; GFX9-NEXT: s_and_b64 s[2:3], vcc, -1 ; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB9_3 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 @@ -5749,8 +6087,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] -; GFX9-NEXT: s_cbranch_execnz .LBB9_2 +; GFX9-NEXT: s_andn2_b64 s[0:1], exec, s[38:39] +; GFX9-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX9-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX9-NEXT: .LBB9_3: ; GFX9-NEXT: s_endpgm ; @@ -5770,8 +6110,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB9_3 +; GFX1064-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 @@ -5817,8 +6158,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39] -; GFX1064-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1064-NEXT: s_andn2_b64 s[0:1], exec, s[38:39] +; GFX1064-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX1064-NEXT: .LBB9_3: ; GFX1064-NEXT: s_endpgm ; @@ -5837,9 +6180,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1032-NEXT: s_addc_u32 s41, s41, 0 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1032-NEXT: s_mov_b32 s38, 0 +; GFX1032-NEXT: s_and_b32 s0, vcc_lo, -1 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB9_3 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s2 ; GFX1032-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 @@ -5884,8 +6228,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 -; GFX1032-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1032-NEXT: s_andn2_b32 s0, exec_lo, s38 +; GFX1032-NEXT: s_and_b32 s1, s0, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s0, s38 +; GFX1032-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX1032-NEXT: .LBB9_3: ; GFX1032-NEXT: s_endpgm ; @@ -5897,11 +6243,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1164-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1164-NEXT: s_mov_b32 s32, 32 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-NEXT: s_cbranch_execz .LBB9_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s0, s[2:3] ; GFX1164-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 @@ -5947,8 +6294,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39] -; GFX1164-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], exec, s[38:39] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX1164-NEXT: .LBB9_3: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm @@ -5961,9 +6311,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1132-NEXT: s_mov_b32 s38, 0 ; GFX1132-NEXT: s_mov_b32 s32, 32 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-NEXT: s_cbranch_execz .LBB9_3 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s0, s2 ; GFX1132-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 @@ -6004,8 +6356,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 -; GFX1132-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1132-NEXT: s_and_not1_b32 s0, exec_lo, s38 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_b32 s1, s0, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s0, s38 +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX1132-NEXT: .LBB9_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm @@ -6023,11 +6378,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: s_add_u32 s40, s40, s3 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0 -; GFX9-DPP-NEXT: s_mov_b32 s33, s2 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_mov_b32 s33, s2 +; GFX9-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 @@ -6072,8 +6428,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX9-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[38:39] +; GFX9-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX9-DPP-NEXT: .LBB9_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -6093,8 +6451,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1064-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[2:3] ; GFX1064-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 @@ -6140,8 +6499,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[38:39] +; GFX1064-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX1064-DPP-NEXT: .LBB9_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -6160,9 +6521,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b32 s38, 0 +; GFX1032-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s2 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 @@ -6207,8 +6569,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s0, exec_lo, s38 +; GFX1032-DPP-NEXT: s_and_b32 s1, s0, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s0, s38 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX1032-DPP-NEXT: .LBB9_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -6220,11 +6584,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, s[2:3] ; GFX1164-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 @@ -6270,8 +6635,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[38:39] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1164-DPP-NEXT: s_and_not1_b64 s[0:1], exec, s[38:39] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX1164-DPP-NEXT: .LBB9_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm @@ -6284,9 +6652,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1132-DPP-NEXT: s_mov_b32 s38, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB9_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, s2 ; GFX1132-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 @@ -6327,8 +6697,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1132-DPP-NEXT: s_and_not1_b32 s0, exec_lo, s38 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_b32 s1, s0, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s0, s38 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX1132-DPP-NEXT: .LBB9_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm @@ -6413,13 +6786,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_1 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], exec, s[42:43] +; GFX7LESS-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[0:1], s[42:43] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; @@ -6500,8 +6875,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-NEXT: s_cbranch_execnz .LBB10_1 +; GFX9-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX9-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX9-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; @@ -6583,8 +6960,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1064-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; @@ -6666,8 +7045,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1032-NEXT: s_andn2_b32 s0, exec_lo, s44 +; GFX1032-NEXT: s_and_b32 s1, s0, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; @@ -6738,8 +7119,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], exec, s[44:45] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm @@ -6804,8 +7188,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1132-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_b32 s1, s0, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm @@ -6887,8 +7274,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_1 +; GFX9-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX9-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; @@ -6970,8 +7359,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1064-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; @@ -7053,8 +7444,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1032-DPP-NEXT: s_andn2_b32 s0, exec_lo, s44 +; GFX1032-DPP-NEXT: s_and_b32 s1, s0, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; @@ -7125,8 +7518,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1164-DPP-NEXT: s_and_not1_b64 s[0:1], exec, s[44:45] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm @@ -7191,8 +7587,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1132-DPP-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_b32 s1, s0, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm @@ -7214,8 +7613,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB11_3 +; GFX7LESS-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] @@ -7244,10 +7644,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7LESS-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX7LESS-NEXT: .LBB11_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -7263,8 +7665,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB11_3 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -7288,9 +7691,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB11_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX9-NEXT: .LBB11_3: ; GFX9-NEXT: s_endpgm ; @@ -7306,8 +7711,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB11_3 +; GFX1064-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 @@ -7330,8 +7736,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1064-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX1064-NEXT: .LBB11_3: ; GFX1064-NEXT: s_endpgm ; @@ -7347,8 +7755,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB11_3 +; GFX1032-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 ; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 @@ -7370,8 +7779,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX1032-NEXT: .LBB11_3: ; GFX1032-NEXT: s_endpgm ; @@ -7381,15 +7792,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off ; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-NEXT: s_cbranch_execz .LBB11_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -7413,9 +7825,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1164-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX1164-NEXT: .LBB11_3: ; GFX1164-NEXT: s_endpgm ; @@ -7426,13 +7840,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off ; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-NEXT: s_cbranch_execz .LBB11_3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -7453,9 +7868,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX1132-NEXT: .LBB11_3: ; GFX1132-NEXT: s_endpgm ; @@ -7471,8 +7888,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX9-DPP-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -7496,9 +7914,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX9-DPP-NEXT: .LBB11_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -7514,8 +7934,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1064-DPP-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 @@ -7538,8 +7959,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX1064-DPP-NEXT: .LBB11_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -7555,8 +7978,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1032-DPP-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 ; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 @@ -7578,8 +8002,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX1032-DPP-NEXT: .LBB11_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -7589,15 +8015,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off ; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -7621,9 +8048,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX1164-DPP-NEXT: .LBB11_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -7634,13 +8063,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off ; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-DPP-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB11_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -7661,9 +8091,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB11_2 ; GFX1132-DPP-NEXT: .LBB11_3: ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fsub ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") monotonic @@ -7716,10 +8148,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] ; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX7LESS-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6 ; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB12_1 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; @@ -7763,9 +8197,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX9-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB12_1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; @@ -7810,8 +8246,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1064-NEXT: v_mov_b32_e32 v5, v3 ; GFX1064-NEXT: v_mov_b32_e32 v4, v2 ; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1064-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX1064-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX1064-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; @@ -7856,8 +8294,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1032-NEXT: v_mov_b32_e32 v5, v3 ; GFX1032-NEXT: v_mov_b32_e32 v4, v2 ; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1032-NEXT: s_andn2_b32 s1, exec_lo, s0 +; GFX1032-NEXT: s_and_b32 s2, s1, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; @@ -7892,9 +8332,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1164-NEXT: v_mov_b32_e32 v5, v3 ; GFX1164-NEXT: v_mov_b32_e32 v4, v2 ; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX1164-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX1164-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-NEXT: s_endpgm ; @@ -7927,9 +8369,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX1132-NEXT: s_and_b32 s2, s1, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-NEXT: s_endpgm ; @@ -7973,9 +8417,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-DPP-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_1 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; @@ -8020,8 +8466,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1064-DPP-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX1064-DPP-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; @@ -8066,8 +8514,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1032-DPP-NEXT: s_andn2_b32 s1, exec_lo, s0 +; GFX1032-DPP-NEXT: s_and_b32 s2, s1, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; @@ -8102,9 +8552,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX1164-DPP-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-DPP-NEXT: s_endpgm ; @@ -8137,9 +8589,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX1132-DPP-NEXT: s_and_b32 s2, s1, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() strictfp @@ -8160,8 +8614,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB13_3 +; GFX7LESS-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] @@ -8190,10 +8645,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7LESS-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB13_2 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX7LESS-NEXT: .LBB13_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -8209,8 +8666,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB13_3 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -8234,9 +8692,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB13_2 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX9-NEXT: .LBB13_3: ; GFX9-NEXT: s_endpgm ; @@ -8252,8 +8712,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB13_3 +; GFX1064-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 @@ -8276,8 +8737,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1064-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX1064-NEXT: .LBB13_3: ; GFX1064-NEXT: s_endpgm ; @@ -8293,8 +8756,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB13_3 +; GFX1032-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 ; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 @@ -8316,8 +8780,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX1032-NEXT: .LBB13_3: ; GFX1032-NEXT: s_endpgm ; @@ -8327,15 +8793,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off ; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-NEXT: s_cbranch_execz .LBB13_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -8359,9 +8826,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX1164-NEXT: .LBB13_3: ; GFX1164-NEXT: s_endpgm ; @@ -8372,13 +8841,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, 0 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off ; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-NEXT: s_cbranch_execz .LBB13_3 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -8399,9 +8869,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX1132-NEXT: .LBB13_3: ; GFX1132-NEXT: s_endpgm ; @@ -8417,8 +8889,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX9-DPP-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -8442,9 +8915,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX9-DPP-NEXT: .LBB13_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -8460,8 +8935,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1064-DPP-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 @@ -8484,8 +8960,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX1064-DPP-NEXT: .LBB13_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -8501,8 +8979,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1032-DPP-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 ; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 @@ -8524,8 +9003,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX1032-DPP-NEXT: .LBB13_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -8535,15 +9016,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off ; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -8567,9 +9049,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[4:5], exec, s[2:3] +; GFX1164-DPP-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX1164-DPP-NEXT: .LBB13_3: ; GFX1164-DPP-NEXT: s_endpgm ; @@ -8580,13 +9064,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off ; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1132-DPP-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -8607,9 +9092,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s3, exec_lo, s2 +; GFX1132-DPP-NEXT: s_and_b32 s4, s3, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB13_2 ; GFX1132-DPP-NEXT: .LBB13_3: ; GFX1132-DPP-NEXT: s_endpgm %result = atomicrmw fsub ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic @@ -8663,10 +9150,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] ; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX7LESS-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6 ; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB14_1 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; @@ -8710,9 +9199,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX9-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB14_1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; @@ -8757,8 +9248,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: v_mov_b32_e32 v5, v3 ; GFX1064-NEXT: v_mov_b32_e32 v4, v2 ; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB14_1 +; GFX1064-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX1064-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; @@ -8803,8 +9296,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: v_mov_b32_e32 v5, v3 ; GFX1032-NEXT: v_mov_b32_e32 v4, v2 ; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB14_1 +; GFX1032-NEXT: s_andn2_b32 s1, exec_lo, s0 +; GFX1032-NEXT: s_and_b32 s2, s1, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; @@ -8839,9 +9334,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v5, v3 ; GFX1164-NEXT: v_mov_b32_e32 v4, v2 ; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execnz .LBB14_1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX1164-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-NEXT: s_endpgm ; @@ -8874,9 +9371,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execnz .LBB14_1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX1132-NEXT: s_and_b32 s2, s1, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-NEXT: s_endpgm ; @@ -8920,9 +9419,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-DPP-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB14_1 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; @@ -8967,8 +9468,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB14_1 +; GFX1064-DPP-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX1064-DPP-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; @@ -9013,8 +9516,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB14_1 +; GFX1032-DPP-NEXT: s_andn2_b32 s1, exec_lo, s0 +; GFX1032-DPP-NEXT: s_and_b32 s2, s1, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; @@ -9049,9 +9554,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB14_1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX1164-DPP-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-DPP-NEXT: s_endpgm ; @@ -9084,9 +9591,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB14_1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX1132-DPP-NEXT: s_and_b32 s2, s1, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.double.value() @@ -9141,10 +9650,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] ; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX7LESS-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6 ; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7 -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB15_1 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; @@ -9188,9 +9699,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX9-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB15_1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; @@ -9235,8 +9748,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: v_mov_b32_e32 v5, v3 ; GFX1064-NEXT: v_mov_b32_e32 v4, v2 ; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1064-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX1064-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; @@ -9281,8 +9796,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: v_mov_b32_e32 v5, v3 ; GFX1032-NEXT: v_mov_b32_e32 v4, v2 ; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1032-NEXT: s_andn2_b32 s1, exec_lo, s0 +; GFX1032-NEXT: s_and_b32 s2, s1, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; @@ -9317,9 +9834,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v5, v3 ; GFX1164-NEXT: v_mov_b32_e32 v4, v2 ; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX1164-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-NEXT: s_endpgm ; @@ -9352,9 +9871,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX1132-NEXT: s_and_b32 s2, s1, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-NEXT: s_endpgm ; @@ -9398,9 +9919,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-DPP-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB15_1 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; @@ -9445,8 +9968,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1064-DPP-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX1064-DPP-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; @@ -9491,8 +10016,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1032-DPP-NEXT: s_andn2_b32 s1, exec_lo, s0 +; GFX1032-DPP-NEXT: s_and_b32 s2, s1, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; @@ -9527,9 +10054,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1] +; GFX1164-DPP-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-DPP-NEXT: s_endpgm ; @@ -9562,9 +10091,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX1132-DPP-NEXT: s_and_b32 s2, s1, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-DPP-NEXT: s_endpgm %divValue = call double @div.float.value() strictfp @@ -9588,8 +10119,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB16_3 +; GFX7LESS-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX7LESS-NEXT: s_cmov_b64 exec, vcc +; GFX7LESS-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -9634,13 +10166,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[40:43], 0 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB16_2 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], exec, s[38:39] +; GFX7LESS-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB16_2 ; GFX7LESS-NEXT: .LBB16_3: ; GFX7LESS-NEXT: s_endpgm ; @@ -9657,11 +10191,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX9-NEXT: s_add_u32 s40, s40, s3 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-NEXT: s_addc_u32 s41, s41, 0 -; GFX9-NEXT: s_mov_b32 s33, s2 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b32 s33, s2 +; GFX9-NEXT: s_and_b64 s[2:3], vcc, -1 ; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB16_3 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -9709,8 +10244,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] -; GFX9-NEXT: s_cbranch_execnz .LBB16_2 +; GFX9-NEXT: s_andn2_b64 s[0:1], exec, s[38:39] +; GFX9-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX9-NEXT: s_cbranch_scc1 .LBB16_2 ; GFX9-NEXT: .LBB16_3: ; GFX9-NEXT: s_endpgm ; @@ -9730,8 +10267,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB16_3 +; GFX1064-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[2:3] ; GFX1064-NEXT: s_mov_b32 s1, 0x43300000 @@ -9778,8 +10316,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39] -; GFX1064-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1064-NEXT: s_andn2_b64 s[0:1], exec, s[38:39] +; GFX1064-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_2 ; GFX1064-NEXT: .LBB16_3: ; GFX1064-NEXT: s_endpgm ; @@ -9798,9 +10338,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1032-NEXT: s_addc_u32 s41, s41, 0 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1032-NEXT: s_mov_b32 s38, 0 +; GFX1032-NEXT: s_and_b32 s0, vcc_lo, -1 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB16_3 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s2 ; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 @@ -9846,8 +10387,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 -; GFX1032-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1032-NEXT: s_andn2_b32 s0, exec_lo, s38 +; GFX1032-NEXT: s_and_b32 s1, s0, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s0, s38 +; GFX1032-NEXT: s_cbranch_scc1 .LBB16_2 ; GFX1032-NEXT: .LBB16_3: ; GFX1032-NEXT: s_endpgm ; @@ -9865,10 +10408,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off offset:16 ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-NEXT: s_mov_b32 s32, 32 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-NEXT: s_cbranch_execz .LBB16_3 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-NEXT: s_cmov_b64 exec, vcc +; GFX1164-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -9915,8 +10459,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39] -; GFX1164-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], exec, s[38:39] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_2 ; GFX1164-NEXT: .LBB16_3: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm @@ -9933,11 +10480,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:20 ; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:16 ; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off offset:16 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1132-NEXT: s_mov_b32 s38, 0 ; GFX1132-NEXT: s_mov_b32 s32, 32 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-NEXT: s_cbranch_execz .LBB16_3 +; GFX1132-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -9978,8 +10526,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 -; GFX1132-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1132-NEXT: s_and_not1_b32 s0, exec_lo, s38 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_b32 s1, s0, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s0, s38 +; GFX1132-NEXT: s_cbranch_scc1 .LBB16_2 ; GFX1132-NEXT: .LBB16_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm @@ -9997,11 +10548,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX9-DPP-NEXT: s_add_u32 s40, s40, s3 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0 -; GFX9-DPP-NEXT: s_mov_b32 s33, s2 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_mov_b32 s33, s2 +; GFX9-DPP-NEXT: s_and_b64 s[2:3], vcc, -1 ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-DPP-NEXT: s_cbranch_execz .LBB16_3 +; GFX9-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX9-DPP-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -10049,8 +10601,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB16_2 +; GFX9-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[38:39] +; GFX9-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB16_2 ; GFX9-DPP-NEXT: .LBB16_3: ; GFX9-DPP-NEXT: s_endpgm ; @@ -10070,8 +10624,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-DPP-NEXT: s_cbranch_execz .LBB16_3 +; GFX1064-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1064-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1064-DPP-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[2:3] ; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 @@ -10118,8 +10673,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1064-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[38:39] +; GFX1064-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB16_2 ; GFX1064-DPP-NEXT: .LBB16_3: ; GFX1064-DPP-NEXT: s_endpgm ; @@ -10138,9 +10695,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b32 s38, 0 +; GFX1032-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-DPP-NEXT: s_cbranch_execz .LBB16_3 +; GFX1032-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s2 ; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 @@ -10186,8 +10744,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1032-DPP-NEXT: s_andn2_b32 s0, exec_lo, s38 +; GFX1032-DPP-NEXT: s_and_b32 s1, s0, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s0, s38 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB16_2 ; GFX1032-DPP-NEXT: .LBB16_3: ; GFX1032-DPP-NEXT: s_endpgm ; @@ -10205,10 +10765,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:16 ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1164-DPP-NEXT: s_cbranch_execz .LBB16_3 +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1164-DPP-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1164-DPP-NEXT: s_cmov_b64 exec, vcc +; GFX1164-DPP-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -10255,8 +10816,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[38:39] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1164-DPP-NEXT: s_and_not1_b64 s[0:1], exec, s[38:39] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[38:39] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB16_2 ; GFX1164-DPP-NEXT: .LBB16_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm @@ -10273,11 +10837,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:20 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:16 ; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:16 +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1132-DPP-NEXT: s_mov_b32 s38, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 -; GFX1132-DPP-NEXT: s_cbranch_execz .LBB16_3 +; GFX1132-DPP-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1132-DPP-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132-DPP-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] @@ -10318,8 +10883,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1132-DPP-NEXT: s_and_not1_b32 s0, exec_lo, s38 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_b32 s1, s0, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s0, s38 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB16_2 ; GFX1132-DPP-NEXT: .LBB16_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm @@ -10404,13 +10972,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] -; GFX7LESS-NEXT: s_cbranch_execnz .LBB17_1 +; GFX7LESS-NEXT: s_andn2_b64 s[0:1], exec, s[42:43] +; GFX7LESS-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX7LESS-NEXT: s_cselect_b64 exec, s[0:1], s[42:43] +; GFX7LESS-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-NEXT: s_endpgm ; @@ -10491,8 +11061,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-NEXT: s_cbranch_execnz .LBB17_1 +; GFX9-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX9-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX9-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; @@ -10574,8 +11146,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1064-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-NEXT: s_endpgm ; @@ -10657,8 +11231,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1032-NEXT: s_andn2_b32 s0, exec_lo, s44 +; GFX1032-NEXT: s_and_b32 s1, s0, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-NEXT: s_endpgm ; @@ -10729,8 +11305,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1164-NEXT: s_and_not1_b64 s[0:1], exec, s[44:45] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1164-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-NEXT: s_endpgm @@ -10795,8 +11374,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1132-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_b32 s1, s0, -1 +; GFX1132-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-NEXT: s_endpgm @@ -10878,8 +11460,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX9-DPP-NEXT: s_cbranch_execnz .LBB17_1 +; GFX9-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX9-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX9-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX9-DPP-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-DPP-NEXT: s_endpgm ; @@ -10961,8 +11545,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] -; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1064-DPP-NEXT: s_andn2_b64 s[0:1], exec, s[44:45] +; GFX1064-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1064-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1064-DPP-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1064-DPP-NEXT: s_endpgm ; @@ -11044,8 +11630,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 -; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1032-DPP-NEXT: s_andn2_b32 s0, exec_lo, s44 +; GFX1032-DPP-NEXT: s_and_b32 s1, s0, -1 +; GFX1032-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1032-DPP-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1032-DPP-NEXT: s_endpgm ; @@ -11116,8 +11704,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] -; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1164-DPP-NEXT: s_and_not1_b64 s[0:1], exec, s[44:45] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1164-DPP-NEXT: s_cselect_b64 exec, s[0:1], s[44:45] +; GFX1164-DPP-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1164-DPP-NEXT: s_endpgm @@ -11182,8 +11773,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 -; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1132-DPP-NEXT: s_and_not1_b32 s0, exec_lo, s44 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_b32 s1, s0, -1 +; GFX1132-DPP-NEXT: s_cselect_b32 exec_lo, s0, s44 +; GFX1132-DPP-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1132-DPP-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/hoist-cond.ll b/llvm/test/CodeGen/AMDGPU/hoist-cond.ll index 830a40ff052ac..a0201778f00ec 100644 --- a/llvm/test/CodeGen/AMDGPU/hoist-cond.ll +++ b/llvm/test/CodeGen/AMDGPU/hoist-cond.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; RUN: llc -mtriple=amdgcn -verify-machineinstrs -disable-block-placement < %s | FileCheck %s ; Check that invariant compare is hoisted out of the loop. diff --git a/llvm/test/CodeGen/AMDGPU/hsa.ll b/llvm/test/CodeGen/AMDGPU/hsa.ll index de484677bf5e6..bc84872c2914b 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa.ll @@ -1,4 +1,3 @@ - ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA %s ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global | FileCheck --check-prefix=HSA-CI %s ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo | FileCheck --check-prefix=HSA %s diff --git a/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll b/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll index f34f9f38feeb4..1e78ca4be7d7f 100644 --- a/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll @@ -18,8 +18,10 @@ define amdgpu_ps void @i1_copy_from_loop(ptr addrspace(8) inreg %rsrc, i32 %tid) ; SI-NEXT: s_andn2_b64 s[6:7], s[6:7], exec ; SI-NEXT: s_and_b64 s[10:11], s[10:11], exec ; SI-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] -; SI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB0_7 +; SI-NEXT: s_andn2_b64 s[10:11], exec, s[4:5] +; SI-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; SI-NEXT: s_cselect_b64 exec, s[10:11], s[4:5] +; SI-NEXT: s_cbranch_scc0 .LBB0_6 ; SI-NEXT: .LBB0_3: ; %for.body ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_cmp_lt_u32 s14, 4 @@ -29,28 +31,30 @@ define amdgpu_ps void @i1_copy_from_loop(ptr addrspace(8) inreg %rsrc, i32 %tid) ; SI-NEXT: s_cbranch_scc1 .LBB0_1 ; SI-NEXT: ; %bb.4: ; %mid.loop ; SI-NEXT: ; in Loop: Header=BB0_3 Depth=1 +; SI-NEXT: s_mov_b64 s[12:13], exec ; SI-NEXT: v_mov_b32_e32 v1, s14 ; SI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 idxen offen -; SI-NEXT: s_mov_b64 s[10:11], -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_le_f32_e32 vcc, 0, v1 +; SI-NEXT: s_mov_b64 s[10:11], -1 +; SI-NEXT: s_and_b64 s[8:9], vcc, -1 ; SI-NEXT: s_mov_b64 s[8:9], -1 -; SI-NEXT: s_and_saveexec_b64 s[12:13], vcc +; SI-NEXT: s_cmov_b64 exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB0_2 ; SI-NEXT: ; %bb.5: ; %end.loop ; SI-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; SI-NEXT: s_add_i32 s14, s14, 1 ; SI-NEXT: s_xor_b64 s[8:9], exec, -1 -; SI-NEXT: ; %bb.6: ; %Flow1 -; SI-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; SI-NEXT: s_or_b64 exec, exec, s[12:13] ; SI-NEXT: s_branch .LBB0_2 -; SI-NEXT: .LBB0_7: ; %for.end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_and_saveexec_b64 s[0:1], s[6:7] -; SI-NEXT: s_cbranch_execz .LBB0_9 -; SI-NEXT: ; %bb.8: ; %if +; SI-NEXT: .LBB0_6: ; %for.end +; SI-NEXT: s_and_b64 s[0:1], s[6:7], exec +; SI-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; SI-NEXT: s_cmov_b64 exec, s[0:1] +; SI-NEXT: s_cbranch_scc0 .LBB0_8 +; SI-NEXT: ; %bb.7: ; %if ; SI-NEXT: exp mrt0 v0, v0, v0, v0 done vm -; SI-NEXT: .LBB0_9: ; %end +; SI-NEXT: .LBB0_8: ; %end ; SI-NEXT: s_endpgm entry: br label %for.body diff --git a/llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll b/llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll index 80aa6ee0ab103..d9cc8aff67a84 100644 --- a/llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll +++ b/llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s diff --git a/llvm/test/CodeGen/AMDGPU/i1_copy_phi_with_phi_incoming_value.mir b/llvm/test/CodeGen/AMDGPU/i1_copy_phi_with_phi_incoming_value.mir index ac0931b6022f1..4980fb5ab39ee 100644 --- a/llvm/test/CodeGen/AMDGPU/i1_copy_phi_with_phi_incoming_value.mir +++ b/llvm/test/CodeGen/AMDGPU/i1_copy_phi_with_phi_incoming_value.mir @@ -33,10 +33,9 @@ body: | ; GCN-NEXT: bb.2: ; GCN-NEXT: successors: %bb.5(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[PHI:%[0-9]+]]:sreg_64 = PHI %15, %bb.6 - ; GCN-NEXT: SI_END_CF [[PHI]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: [[S_MOV_B64_2:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 ; GCN-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: S_BRANCH %bb.5 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.3: @@ -45,33 +44,32 @@ body: | ; GCN-NEXT: ATOMIC_FENCE 5, 2 ; GCN-NEXT: S_BARRIER ; GCN-NEXT: ATOMIC_FENCE 4, 2 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sreg_64 = COPY %18 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sreg_64 = COPY %16 ; GCN-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF [[COPY6]], %bb.7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: S_BRANCH %bb.4 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.4: ; GCN-NEXT: successors: %bb.7(0x80000000) ; GCN-NEXT: {{ $}} + ; GCN-NEXT: SI_WAVE_RECONVERGE [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: S_BRANCH %bb.7 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.5: ; GCN-NEXT: successors: %bb.3(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[PHI1:%[0-9]+]]:sreg_64 = PHI [[S_MOV_B64_]], %bb.0, [[COPY5]], %bb.2 - ; GCN-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: [[PHI:%[0-9]+]]:sreg_64 = PHI [[S_MOV_B64_]], %bb.0, [[COPY5]], %bb.2 ; GCN-NEXT: S_BRANCH %bb.3 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.6: ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.6(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[PHI2:%[0-9]+]]:sreg_64 = PHI [[S_MOV_B64_1]], %bb.1, %15, %bb.6 + ; GCN-NEXT: [[PHI1:%[0-9]+]]:sreg_64 = PHI [[S_MOV_B64_1]], %bb.1, %21, %bb.6 ; GCN-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY [[COPY4]] - ; GCN-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_64 = SI_IF_BREAK [[COPY7]], [[PHI2]], implicit-def dead $scc + ; GCN-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_64 = SI_IF_BREAK [[COPY7]], [[PHI1]], implicit-def dead $scc ; GCN-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: S_BRANCH %bb.2 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.7: - ; GCN-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1, %bb.5 @@ -89,7 +87,6 @@ body: | S_BRANCH %bb.1 bb.1: - ; predecessors: %bb.0 successors: %bb.6 %10:sreg_32 = S_MOV_B32 16 @@ -100,17 +97,14 @@ body: | S_BRANCH %bb.6 bb.2: - ; predecessors: %bb.6 successors: %bb.5 - %20:sreg_64 = PHI %6:sreg_64, %bb.6 - SI_END_CF %20:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec %15:sreg_64 = S_MOV_B64 -1 %21:vreg_1 = COPY %15:sreg_64, implicit $exec + SI_WAVE_RECONVERGE %16:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.5 bb.3: - ; predecessors: %bb.5 successors: %bb.4, %bb.7 %22:vreg_1 = PHI %7:vreg_1, %bb.5 @@ -122,21 +116,18 @@ body: | S_BRANCH %bb.4 bb.4: - ; predecessors: %bb.3 successors: %bb.7 + SI_WAVE_RECONVERGE %24:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.7 bb.5: - ; predecessors: %bb.0, %bb.2 successors: %bb.3 %7:vreg_1 = PHI %17:vreg_1, %bb.0, %21:vreg_1, %bb.2 - SI_END_CF %16:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.3 bb.6: - ; predecessors: %bb.1, %bb.6 successors: %bb.2, %bb.6 %5:sreg_64 = PHI %12:sreg_64, %bb.1, %6:sreg_64, %bb.6 @@ -146,9 +137,7 @@ body: | S_BRANCH %bb.2 bb.7: - ; predecessors: %bb.3, %bb.4 - SI_END_CF %24:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_ENDPGM 0 ... diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll index 297b5180dfe9b..eaea28a9f64f6 100644 --- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll +++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll @@ -6,374 +6,314 @@ define void @main(i1 %arg) #0 { ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-NEXT: v_writelane_b32 v8, s30, 0 -; CHECK-NEXT: v_writelane_b32 v8, s31, 1 -; CHECK-NEXT: v_writelane_b32 v8, s36, 2 -; CHECK-NEXT: v_writelane_b32 v8, s37, 3 -; CHECK-NEXT: v_writelane_b32 v8, s38, 4 -; CHECK-NEXT: v_writelane_b32 v8, s39, 5 -; CHECK-NEXT: v_writelane_b32 v8, s40, 6 -; CHECK-NEXT: v_writelane_b32 v8, s41, 7 -; CHECK-NEXT: v_writelane_b32 v8, s42, 8 -; CHECK-NEXT: v_writelane_b32 v8, s43, 9 -; CHECK-NEXT: v_writelane_b32 v8, s44, 10 -; CHECK-NEXT: v_writelane_b32 v8, s45, 11 -; CHECK-NEXT: v_writelane_b32 v8, s46, 12 -; CHECK-NEXT: v_writelane_b32 v8, s47, 13 -; CHECK-NEXT: v_writelane_b32 v8, s48, 14 -; CHECK-NEXT: v_writelane_b32 v8, s49, 15 +; CHECK-NEXT: v_writelane_b32 v7, s30, 0 +; CHECK-NEXT: v_writelane_b32 v7, s31, 1 +; CHECK-NEXT: v_writelane_b32 v7, s34, 2 +; CHECK-NEXT: v_writelane_b32 v7, s35, 3 +; CHECK-NEXT: v_writelane_b32 v7, s36, 4 +; CHECK-NEXT: v_writelane_b32 v7, s37, 5 +; CHECK-NEXT: v_writelane_b32 v7, s38, 6 +; CHECK-NEXT: v_writelane_b32 v7, s39, 7 +; CHECK-NEXT: v_writelane_b32 v7, s40, 8 +; CHECK-NEXT: v_writelane_b32 v7, s41, 9 +; CHECK-NEXT: v_writelane_b32 v7, s42, 10 +; CHECK-NEXT: v_writelane_b32 v7, s43, 11 +; CHECK-NEXT: v_writelane_b32 v7, s44, 12 +; CHECK-NEXT: v_writelane_b32 v7, s45, 13 +; CHECK-NEXT: v_writelane_b32 v7, s46, 14 +; CHECK-NEXT: v_writelane_b32 v7, s47, 15 +; CHECK-NEXT: v_writelane_b32 v7, s48, 16 +; CHECK-NEXT: v_writelane_b32 v7, s49, 17 ; CHECK-NEXT: s_getpc_b64 s[24:25] -; CHECK-NEXT: v_writelane_b32 v8, s50, 16 +; CHECK-NEXT: v_writelane_b32 v7, s50, 18 ; CHECK-NEXT: s_movk_i32 s4, 0xf0 ; CHECK-NEXT: s_mov_b32 s5, s24 -; CHECK-NEXT: v_writelane_b32 v8, s51, 17 +; CHECK-NEXT: v_writelane_b32 v7, s51, 19 ; CHECK-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 -; CHECK-NEXT: ; implicit-def: $vgpr4 : SGPR spill to VGPR lane ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: s_load_dwordx4 s[28:31], s[4:5], 0x0 ; CHECK-NEXT: s_movk_i32 s4, 0x130 ; CHECK-NEXT: s_mov_b32 s5, s24 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_writelane_b32 v4, s36, 0 -; CHECK-NEXT: v_writelane_b32 v4, s37, 1 -; CHECK-NEXT: v_writelane_b32 v4, s38, 2 -; CHECK-NEXT: v_writelane_b32 v4, s39, 3 -; CHECK-NEXT: v_writelane_b32 v4, s40, 4 -; CHECK-NEXT: v_writelane_b32 v4, s41, 5 -; CHECK-NEXT: v_writelane_b32 v4, s42, 6 -; CHECK-NEXT: v_writelane_b32 v4, s43, 7 -; CHECK-NEXT: v_writelane_b32 v4, s44, 8 -; CHECK-NEXT: v_writelane_b32 v4, s45, 9 -; CHECK-NEXT: v_writelane_b32 v4, s46, 10 ; CHECK-NEXT: s_load_dwordx16 s[4:19], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v4, s47, 11 -; CHECK-NEXT: v_writelane_b32 v4, s48, 12 -; CHECK-NEXT: v_writelane_b32 v4, s49, 13 ; CHECK-NEXT: s_mov_b32 s20, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: v_writelane_b32 v4, s50, 14 -; CHECK-NEXT: v_mov_b32_e32 v5, s28 -; CHECK-NEXT: v_mov_b32_e32 v6, v1 +; CHECK-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v4, s28 +; CHECK-NEXT: v_mov_b32_e32 v5, v1 ; CHECK-NEXT: s_mov_b32 s21, s20 ; CHECK-NEXT: s_mov_b32 s22, s20 ; CHECK-NEXT: s_mov_b32 s23, s20 -; CHECK-NEXT: v_writelane_b32 v4, s51, 15 ; CHECK-NEXT: v_mov_b32_e32 v2, v1 -; CHECK-NEXT: image_sample_lz v5, v[5:6], s[44:51], s[20:23] dmask:0x1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_writelane_b32 v4, s4, 16 -; CHECK-NEXT: v_writelane_b32 v4, s5, 17 -; CHECK-NEXT: v_writelane_b32 v4, s6, 18 -; CHECK-NEXT: v_writelane_b32 v4, s7, 19 -; CHECK-NEXT: v_writelane_b32 v4, s8, 20 -; CHECK-NEXT: v_writelane_b32 v4, s9, 21 -; CHECK-NEXT: image_sample_lz v6, v[1:2], s[4:11], s[20:23] dmask:0x1 -; CHECK-NEXT: v_writelane_b32 v4, s10, 22 -; CHECK-NEXT: v_writelane_b32 v4, s11, 23 -; CHECK-NEXT: v_writelane_b32 v4, s12, 24 -; CHECK-NEXT: v_writelane_b32 v4, s13, 25 -; CHECK-NEXT: v_writelane_b32 v4, s14, 26 -; CHECK-NEXT: v_writelane_b32 v4, s15, 27 -; CHECK-NEXT: v_writelane_b32 v4, s16, 28 -; CHECK-NEXT: v_writelane_b32 v8, s52, 18 -; CHECK-NEXT: v_writelane_b32 v4, s17, 29 -; CHECK-NEXT: v_writelane_b32 v8, s53, 19 -; CHECK-NEXT: v_writelane_b32 v4, s18, 30 -; CHECK-NEXT: v_writelane_b32 v8, s54, 20 -; CHECK-NEXT: v_writelane_b32 v4, s19, 31 +; CHECK-NEXT: v_writelane_b32 v3, s36, 0 +; CHECK-NEXT: v_writelane_b32 v7, s52, 20 +; CHECK-NEXT: v_writelane_b32 v7, s53, 21 +; CHECK-NEXT: v_writelane_b32 v3, s37, 1 +; CHECK-NEXT: v_writelane_b32 v7, s54, 22 +; CHECK-NEXT: v_writelane_b32 v3, s38, 2 +; CHECK-NEXT: image_sample_lz v4, v[4:5], s[44:51], s[20:23] dmask:0x1 +; CHECK-NEXT: v_writelane_b32 v7, s55, 23 +; CHECK-NEXT: image_sample_lz v5, v[1:2], s[4:11], s[20:23] dmask:0x1 +; CHECK-NEXT: v_writelane_b32 v3, s39, 3 +; CHECK-NEXT: v_writelane_b32 v7, s56, 24 +; CHECK-NEXT: v_writelane_b32 v3, s40, 4 +; CHECK-NEXT: v_writelane_b32 v7, s57, 25 +; CHECK-NEXT: v_writelane_b32 v3, s41, 5 +; CHECK-NEXT: v_writelane_b32 v7, s58, 26 +; CHECK-NEXT: v_writelane_b32 v3, s42, 6 +; CHECK-NEXT: v_writelane_b32 v7, s59, 27 +; CHECK-NEXT: v_writelane_b32 v3, s43, 7 +; CHECK-NEXT: v_writelane_b32 v7, s60, 28 +; CHECK-NEXT: v_writelane_b32 v3, s44, 8 +; CHECK-NEXT: v_writelane_b32 v7, s61, 29 +; CHECK-NEXT: v_writelane_b32 v3, s45, 9 +; CHECK-NEXT: v_writelane_b32 v7, s62, 30 +; CHECK-NEXT: v_writelane_b32 v3, s46, 10 +; CHECK-NEXT: v_writelane_b32 v7, s63, 31 +; CHECK-NEXT: v_writelane_b32 v3, s47, 11 +; CHECK-NEXT: v_writelane_b32 v7, s64, 32 +; CHECK-NEXT: v_writelane_b32 v3, s48, 12 +; CHECK-NEXT: v_writelane_b32 v7, s65, 33 +; CHECK-NEXT: v_writelane_b32 v3, s49, 13 +; CHECK-NEXT: v_writelane_b32 v7, s66, 34 +; CHECK-NEXT: v_writelane_b32 v3, s50, 14 ; CHECK-NEXT: s_mov_b32 s4, 48 +; CHECK-NEXT: s_movk_i32 s28, 0x1f0 ; CHECK-NEXT: s_mov_b32 s5, s24 -; CHECK-NEXT: v_writelane_b32 v8, s55, 21 -; CHECK-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v8, s56, 22 -; CHECK-NEXT: v_writelane_b32 v8, s57, 23 -; CHECK-NEXT: v_writelane_b32 v8, s58, 24 -; CHECK-NEXT: v_writelane_b32 v8, s59, 25 -; CHECK-NEXT: v_writelane_b32 v8, s60, 26 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_writelane_b32 v4, s4, 32 -; CHECK-NEXT: v_writelane_b32 v8, s61, 27 -; CHECK-NEXT: v_writelane_b32 v4, s5, 33 -; CHECK-NEXT: v_writelane_b32 v8, s62, 28 -; CHECK-NEXT: v_writelane_b32 v4, s6, 34 -; CHECK-NEXT: v_writelane_b32 v8, s63, 29 -; CHECK-NEXT: v_writelane_b32 v4, s7, 35 -; CHECK-NEXT: v_writelane_b32 v8, s64, 30 -; CHECK-NEXT: v_writelane_b32 v4, s8, 36 -; CHECK-NEXT: v_writelane_b32 v8, s65, 31 -; CHECK-NEXT: v_writelane_b32 v4, s9, 37 -; CHECK-NEXT: v_writelane_b32 v8, s66, 32 -; CHECK-NEXT: s_movk_i32 s26, 0x1f0 -; CHECK-NEXT: s_movk_i32 s28, 0x2f0 -; CHECK-NEXT: s_mov_b32 s27, s24 ; CHECK-NEXT: s_mov_b32 s29, s24 -; CHECK-NEXT: v_writelane_b32 v4, s10, 38 -; CHECK-NEXT: v_writelane_b32 v8, s67, 33 -; CHECK-NEXT: v_writelane_b32 v4, s11, 39 -; CHECK-NEXT: s_load_dwordx16 s[52:67], s[26:27], 0x0 -; CHECK-NEXT: s_load_dwordx16 s[4:19], s[28:29], 0x0 +; CHECK-NEXT: v_writelane_b32 v7, s67, 35 +; CHECK-NEXT: v_writelane_b32 v3, s51, 15 +; CHECK-NEXT: s_movk_i32 s30, 0x2f0 +; CHECK-NEXT: s_mov_b32 s31, s24 +; CHECK-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_load_dwordx16 s[52:67], s[28:29], 0x0 +; CHECK-NEXT: s_load_dwordx16 s[36:51], s[30:31], 0x0 ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; CHECK-NEXT: s_xor_b64 s[24:25], vcc, -1 -; CHECK-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane +; CHECK-NEXT: s_and_b64 vcc, s[24:25], exec +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_writelane_b32 v3, s36, 16 +; CHECK-NEXT: v_writelane_b32 v3, s37, 17 +; CHECK-NEXT: v_writelane_b32 v3, s38, 18 +; CHECK-NEXT: v_writelane_b32 v3, s39, 19 +; CHECK-NEXT: v_writelane_b32 v3, s40, 20 +; CHECK-NEXT: v_writelane_b32 v3, s41, 21 +; CHECK-NEXT: v_writelane_b32 v3, s42, 22 +; CHECK-NEXT: v_writelane_b32 v3, s43, 23 +; CHECK-NEXT: v_writelane_b32 v3, s44, 24 +; CHECK-NEXT: v_writelane_b32 v3, s45, 25 +; CHECK-NEXT: v_writelane_b32 v3, s46, 26 +; CHECK-NEXT: v_writelane_b32 v3, s47, 27 +; CHECK-NEXT: v_writelane_b32 v3, s48, 28 +; CHECK-NEXT: v_writelane_b32 v3, s49, 29 +; CHECK-NEXT: s_xor_b64 s[26:27], vcc, exec +; CHECK-NEXT: v_writelane_b32 v3, s50, 30 +; CHECK-NEXT: s_and_b64 s[34:35], vcc, -1 +; CHECK-NEXT: v_writelane_b32 v3, s51, 31 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mul_f32_e32 v0, v6, v5 -; CHECK-NEXT: s_and_saveexec_b64 s[26:27], s[24:25] -; CHECK-NEXT: s_xor_b64 s[26:27], exec, s[26:27] -; CHECK-NEXT: s_cbranch_execz .LBB0_3 +; CHECK-NEXT: v_mul_f32_e32 v0, v5, v4 +; CHECK-NEXT: s_cmov_b64 exec, vcc +; CHECK-NEXT: s_cbranch_scc0 .LBB0_4 ; CHECK-NEXT: ; %bb.1: ; %bb48 -; CHECK-NEXT: v_readlane_b32 s36, v4, 0 -; CHECK-NEXT: v_readlane_b32 s44, v4, 8 -; CHECK-NEXT: v_readlane_b32 s45, v4, 9 -; CHECK-NEXT: v_readlane_b32 s46, v4, 10 -; CHECK-NEXT: v_readlane_b32 s47, v4, 11 -; CHECK-NEXT: v_readlane_b32 s48, v4, 12 -; CHECK-NEXT: v_readlane_b32 s49, v4, 13 -; CHECK-NEXT: v_readlane_b32 s50, v4, 14 -; CHECK-NEXT: v_readlane_b32 s51, v4, 15 -; CHECK-NEXT: s_and_b64 vcc, exec, -1 -; CHECK-NEXT: v_readlane_b32 s37, v4, 1 -; CHECK-NEXT: v_readlane_b32 s38, v4, 2 -; CHECK-NEXT: v_readlane_b32 s39, v4, 3 -; CHECK-NEXT: v_readlane_b32 s40, v4, 4 -; CHECK-NEXT: image_sample_lz v5, v[1:2], s[44:51], s[20:23] dmask:0x1 +; CHECK-NEXT: v_readlane_b32 s36, v3, 0 +; CHECK-NEXT: v_readlane_b32 s44, v3, 8 +; CHECK-NEXT: v_readlane_b32 s45, v3, 9 +; CHECK-NEXT: v_readlane_b32 s46, v3, 10 +; CHECK-NEXT: v_readlane_b32 s47, v3, 11 +; CHECK-NEXT: v_readlane_b32 s48, v3, 12 +; CHECK-NEXT: v_readlane_b32 s49, v3, 13 +; CHECK-NEXT: v_readlane_b32 s50, v3, 14 +; CHECK-NEXT: v_readlane_b32 s51, v3, 15 +; CHECK-NEXT: v_readlane_b32 s37, v3, 1 +; CHECK-NEXT: v_readlane_b32 s38, v3, 2 +; CHECK-NEXT: v_readlane_b32 s39, v3, 3 +; CHECK-NEXT: v_readlane_b32 s40, v3, 4 +; CHECK-NEXT: v_readlane_b32 s41, v3, 5 +; CHECK-NEXT: image_sample_lz v4, v[1:2], s[44:51], s[20:23] dmask:0x1 +; CHECK-NEXT: v_readlane_b32 s42, v3, 6 +; CHECK-NEXT: v_readlane_b32 s43, v3, 7 +; CHECK-NEXT: v_readlane_b32 s36, v3, 16 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: v_readlane_b32 s41, v4, 5 -; CHECK-NEXT: v_readlane_b32 s42, v4, 6 -; CHECK-NEXT: v_readlane_b32 s43, v4, 7 +; CHECK-NEXT: v_readlane_b32 s44, v3, 24 +; CHECK-NEXT: v_readlane_b32 s45, v3, 25 +; CHECK-NEXT: v_readlane_b32 s46, v3, 26 +; CHECK-NEXT: v_readlane_b32 s47, v3, 27 +; CHECK-NEXT: v_readlane_b32 s48, v3, 28 +; CHECK-NEXT: v_readlane_b32 s49, v3, 29 +; CHECK-NEXT: v_readlane_b32 s50, v3, 30 +; CHECK-NEXT: v_readlane_b32 s51, v3, 31 +; CHECK-NEXT: s_and_b64 vcc, exec, -1 +; CHECK-NEXT: v_readlane_b32 s37, v3, 17 +; CHECK-NEXT: v_readlane_b32 s38, v3, 18 +; CHECK-NEXT: v_readlane_b32 s39, v3, 19 +; CHECK-NEXT: v_readlane_b32 s40, v3, 20 +; CHECK-NEXT: v_readlane_b32 s41, v3, 21 +; CHECK-NEXT: v_readlane_b32 s42, v3, 22 +; CHECK-NEXT: v_readlane_b32 s43, v3, 23 ; CHECK-NEXT: .LBB0_2: ; %bb50 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_readlane_b32 s36, v4, 32 -; CHECK-NEXT: v_readlane_b32 s40, v4, 36 -; CHECK-NEXT: v_readlane_b32 s41, v4, 37 -; CHECK-NEXT: v_readlane_b32 s42, v4, 38 -; CHECK-NEXT: v_readlane_b32 s43, v4, 39 ; CHECK-NEXT: s_mov_b32 s21, s20 ; CHECK-NEXT: s_mov_b32 s22, s20 ; CHECK-NEXT: s_mov_b32 s23, s20 -; CHECK-NEXT: v_readlane_b32 s37, v4, 33 -; CHECK-NEXT: v_readlane_b32 s38, v4, 34 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: image_sample_lz v6, v[1:2], s[60:67], s[40:43] dmask:0x1 -; CHECK-NEXT: v_readlane_b32 s39, v4, 35 -; CHECK-NEXT: image_sample_lz v1, v[1:2], s[12:19], s[20:23] dmask:0x1 +; CHECK-NEXT: image_sample_lz v5, v[1:2], s[60:67], s[8:11] dmask:0x1 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: image_sample_lz v1, v[1:2], s[44:51], s[20:23] dmask:0x1 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_sub_f32_e32 v1, v1, v6 +; CHECK-NEXT: v_sub_f32_e32 v1, v1, v5 ; CHECK-NEXT: v_mul_f32_e32 v1, v1, v0 -; CHECK-NEXT: v_mul_f32_e32 v1, v1, v5 +; CHECK-NEXT: v_mul_f32_e32 v1, v1, v4 ; CHECK-NEXT: s_mov_b64 vcc, vcc ; CHECK-NEXT: s_cbranch_vccnz .LBB0_2 -; CHECK-NEXT: .LBB0_3: ; %Flow14 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_readlane_b32 s12, v4, 32 -; CHECK-NEXT: v_readlane_b32 s13, v4, 33 -; CHECK-NEXT: v_readlane_b32 s14, v4, 34 -; CHECK-NEXT: v_readlane_b32 s15, v4, 35 -; CHECK-NEXT: v_readlane_b32 s16, v4, 36 -; CHECK-NEXT: v_readlane_b32 s17, v4, 37 -; CHECK-NEXT: v_readlane_b32 s18, v4, 38 -; CHECK-NEXT: v_readlane_b32 s19, v4, 39 -; CHECK-NEXT: v_writelane_b32 v4, s4, 40 -; CHECK-NEXT: v_writelane_b32 v4, s5, 41 -; CHECK-NEXT: v_writelane_b32 v4, s6, 42 -; CHECK-NEXT: v_writelane_b32 v4, s7, 43 -; CHECK-NEXT: v_writelane_b32 v4, s8, 44 -; CHECK-NEXT: v_writelane_b32 v4, s9, 45 -; CHECK-NEXT: v_writelane_b32 v4, s10, 46 -; CHECK-NEXT: v_writelane_b32 v4, s11, 47 -; CHECK-NEXT: v_writelane_b32 v4, s12, 48 -; CHECK-NEXT: v_writelane_b32 v4, s13, 49 -; CHECK-NEXT: v_writelane_b32 v4, s14, 50 -; CHECK-NEXT: v_writelane_b32 v4, s15, 51 -; CHECK-NEXT: v_writelane_b32 v4, s16, 52 -; CHECK-NEXT: v_writelane_b32 v4, s17, 53 -; CHECK-NEXT: v_writelane_b32 v4, s18, 54 -; CHECK-NEXT: v_writelane_b32 v4, s19, 55 -; CHECK-NEXT: v_writelane_b32 v4, s52, 56 -; CHECK-NEXT: v_writelane_b32 v3, s60, 0 -; CHECK-NEXT: v_writelane_b32 v4, s53, 57 -; CHECK-NEXT: v_writelane_b32 v3, s61, 1 -; CHECK-NEXT: v_writelane_b32 v4, s54, 58 -; CHECK-NEXT: v_writelane_b32 v3, s62, 2 -; CHECK-NEXT: v_writelane_b32 v4, s55, 59 -; CHECK-NEXT: v_writelane_b32 v3, s63, 3 -; CHECK-NEXT: v_writelane_b32 v4, s56, 60 -; CHECK-NEXT: v_writelane_b32 v3, s64, 4 -; CHECK-NEXT: v_writelane_b32 v4, s57, 61 -; CHECK-NEXT: v_writelane_b32 v3, s65, 5 -; CHECK-NEXT: v_writelane_b32 v4, s58, 62 -; CHECK-NEXT: v_writelane_b32 v3, s66, 6 -; CHECK-NEXT: v_writelane_b32 v4, s59, 63 -; CHECK-NEXT: v_writelane_b32 v3, s67, 7 -; CHECK-NEXT: s_andn2_saveexec_b64 s[20:21], s[26:27] -; CHECK-NEXT: s_cbranch_execz .LBB0_10 -; CHECK-NEXT: ; %bb.4: ; %bb32 -; CHECK-NEXT: s_and_saveexec_b64 s[8:9], s[24:25] -; CHECK-NEXT: s_xor_b64 s[22:23], exec, s[8:9] -; CHECK-NEXT: s_cbranch_execz .LBB0_6 -; CHECK-NEXT: ; %bb.5: ; %bb43 +; CHECK-NEXT: ; %bb.3: ; %Flow +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: s_or_b64 exec, exec, s[26:27] +; CHECK-NEXT: .LBB0_4: ; %Flow14 +; CHECK-NEXT: s_xor_b64 s[20:21], s[26:27], exec +; CHECK-NEXT: s_and_b64 s[8:9], s[26:27], -1 +; CHECK-NEXT: s_cmov_b64 exec, s[26:27] +; CHECK-NEXT: s_cbranch_scc0 .LBB0_12 +; CHECK-NEXT: ; %bb.5: ; %bb32 +; CHECK-NEXT: s_and_b64 s[8:9], s[24:25], exec +; CHECK-NEXT: s_xor_b64 s[22:23], s[8:9], exec +; CHECK-NEXT: s_and_b64 s[10:11], s[8:9], -1 +; CHECK-NEXT: s_cmov_b64 exec, s[8:9] +; CHECK-NEXT: s_cbranch_scc0 .LBB0_7 +; CHECK-NEXT: ; %bb.6: ; %bb43 ; CHECK-NEXT: s_mov_b32 s8, 0 ; CHECK-NEXT: s_mov_b32 s9, s8 ; CHECK-NEXT: v_mov_b32_e32 v0, s8 -; CHECK-NEXT: v_readlane_b32 s36, v4, 0 +; CHECK-NEXT: v_readlane_b32 s36, v3, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, s9 ; CHECK-NEXT: s_mov_b32 s10, s8 ; CHECK-NEXT: s_mov_b32 s11, s8 -; CHECK-NEXT: v_readlane_b32 s37, v4, 1 -; CHECK-NEXT: v_readlane_b32 s38, v4, 2 -; CHECK-NEXT: v_readlane_b32 s39, v4, 3 -; CHECK-NEXT: v_readlane_b32 s40, v4, 4 -; CHECK-NEXT: v_readlane_b32 s41, v4, 5 -; CHECK-NEXT: v_readlane_b32 s42, v4, 6 -; CHECK-NEXT: v_readlane_b32 s43, v4, 7 -; CHECK-NEXT: v_readlane_b32 s44, v4, 8 -; CHECK-NEXT: v_readlane_b32 s45, v4, 9 -; CHECK-NEXT: v_readlane_b32 s46, v4, 10 -; CHECK-NEXT: v_readlane_b32 s47, v4, 11 -; CHECK-NEXT: v_readlane_b32 s48, v4, 12 -; CHECK-NEXT: v_readlane_b32 s49, v4, 13 -; CHECK-NEXT: v_readlane_b32 s50, v4, 14 -; CHECK-NEXT: v_readlane_b32 s51, v4, 15 -; CHECK-NEXT: image_sample_lz v5, v[0:1], s[36:43], s[8:11] dmask:0x1 -; CHECK-NEXT: v_readlane_b32 s36, v4, 16 -; CHECK-NEXT: v_readlane_b32 s44, v4, 24 -; CHECK-NEXT: v_readlane_b32 s45, v4, 25 -; CHECK-NEXT: v_readlane_b32 s46, v4, 26 -; CHECK-NEXT: v_readlane_b32 s47, v4, 27 -; CHECK-NEXT: v_readlane_b32 s48, v4, 28 -; CHECK-NEXT: v_readlane_b32 s49, v4, 29 -; CHECK-NEXT: v_readlane_b32 s50, v4, 30 -; CHECK-NEXT: v_readlane_b32 s51, v4, 31 -; CHECK-NEXT: v_mov_b32_e32 v6, 0 -; CHECK-NEXT: v_mov_b32_e32 v7, v6 -; CHECK-NEXT: v_readlane_b32 s37, v4, 17 -; CHECK-NEXT: v_readlane_b32 s38, v4, 18 -; CHECK-NEXT: v_readlane_b32 s39, v4, 19 -; CHECK-NEXT: image_sample_lz v0, v[0:1], s[44:51], s[12:15] dmask:0x1 -; CHECK-NEXT: v_readlane_b32 s40, v4, 20 -; CHECK-NEXT: v_readlane_b32 s41, v4, 21 -; CHECK-NEXT: v_readlane_b32 s42, v4, 22 -; CHECK-NEXT: v_readlane_b32 s43, v4, 23 +; CHECK-NEXT: v_readlane_b32 s37, v3, 1 +; CHECK-NEXT: v_readlane_b32 s38, v3, 2 +; CHECK-NEXT: v_readlane_b32 s39, v3, 3 +; CHECK-NEXT: v_readlane_b32 s40, v3, 4 +; CHECK-NEXT: v_readlane_b32 s41, v3, 5 +; CHECK-NEXT: v_readlane_b32 s42, v3, 6 +; CHECK-NEXT: v_readlane_b32 s43, v3, 7 +; CHECK-NEXT: s_nop 4 +; CHECK-NEXT: image_sample_lz v4, v[0:1], s[36:43], s[8:11] dmask:0x1 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: image_sample_lz v0, v[0:1], s[12:19], s[4:7] dmask:0x1 +; CHECK-NEXT: v_mov_b32_e32 v5, 0 +; CHECK-NEXT: v_mov_b32_e32 v6, v5 +; CHECK-NEXT: v_readlane_b32 s44, v3, 8 +; CHECK-NEXT: v_readlane_b32 s45, v3, 9 +; CHECK-NEXT: v_readlane_b32 s46, v3, 10 +; CHECK-NEXT: v_readlane_b32 s47, v3, 11 +; CHECK-NEXT: v_readlane_b32 s48, v3, 12 +; CHECK-NEXT: v_readlane_b32 s49, v3, 13 +; CHECK-NEXT: v_readlane_b32 s50, v3, 14 +; CHECK-NEXT: v_readlane_b32 s51, v3, 15 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_dwordx3 v[5:7], off, s[8:11], 0 +; CHECK-NEXT: buffer_store_dwordx3 v[4:6], off, s[8:11], 0 ; CHECK-NEXT: s_waitcnt vmcnt(1) ; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; CHECK-NEXT: ; implicit-def: $vgpr0 -; CHECK-NEXT: .LBB0_6: ; %Flow12 -; CHECK-NEXT: s_or_saveexec_b64 s[4:5], s[22:23] -; CHECK-NEXT: v_readlane_b32 s52, v4, 40 -; CHECK-NEXT: v_readlane_b32 s53, v4, 41 -; CHECK-NEXT: v_readlane_b32 s54, v4, 42 -; CHECK-NEXT: v_readlane_b32 s55, v4, 43 -; CHECK-NEXT: v_readlane_b32 s56, v4, 44 -; CHECK-NEXT: v_readlane_b32 s57, v4, 45 -; CHECK-NEXT: v_readlane_b32 s58, v4, 46 -; CHECK-NEXT: v_readlane_b32 s59, v4, 47 -; CHECK-NEXT: v_readlane_b32 s60, v4, 48 -; CHECK-NEXT: v_readlane_b32 s61, v4, 49 -; CHECK-NEXT: v_readlane_b32 s62, v4, 50 -; CHECK-NEXT: v_readlane_b32 s63, v4, 51 -; CHECK-NEXT: v_readlane_b32 s64, v4, 52 -; CHECK-NEXT: v_readlane_b32 s65, v4, 53 -; CHECK-NEXT: v_readlane_b32 s66, v4, 54 -; CHECK-NEXT: v_readlane_b32 s67, v4, 55 -; CHECK-NEXT: s_xor_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_cbranch_execz .LBB0_9 -; CHECK-NEXT: ; %bb.7: ; %bb33.preheader +; CHECK-NEXT: s_or_b64 exec, exec, s[22:23] +; CHECK-NEXT: .LBB0_7: ; %Flow12 +; CHECK-NEXT: s_xor_b64 s[4:5], s[22:23], exec +; CHECK-NEXT: s_and_b64 s[6:7], s[22:23], -1 +; CHECK-NEXT: s_cmov_b64 exec, s[22:23] +; CHECK-NEXT: s_cbranch_scc0 .LBB0_11 +; CHECK-NEXT: ; %bb.8: ; %bb33.preheader ; CHECK-NEXT: s_mov_b32 s8, 0 ; CHECK-NEXT: s_mov_b32 s6, s8 ; CHECK-NEXT: s_mov_b32 s7, s8 ; CHECK-NEXT: v_mov_b32_e32 v1, s6 -; CHECK-NEXT: v_readlane_b32 s36, v4, 56 +; CHECK-NEXT: v_readlane_b32 s36, v3, 16 ; CHECK-NEXT: s_mov_b32 s9, s8 ; CHECK-NEXT: s_mov_b32 s10, s8 ; CHECK-NEXT: s_mov_b32 s11, s8 ; CHECK-NEXT: v_mov_b32_e32 v2, s7 -; CHECK-NEXT: v_readlane_b32 s37, v4, 57 -; CHECK-NEXT: v_readlane_b32 s38, v4, 58 -; CHECK-NEXT: v_readlane_b32 s39, v4, 59 -; CHECK-NEXT: v_readlane_b32 s40, v4, 60 -; CHECK-NEXT: v_readlane_b32 s41, v4, 61 -; CHECK-NEXT: v_readlane_b32 s42, v4, 62 -; CHECK-NEXT: v_readlane_b32 s43, v4, 63 -; CHECK-NEXT: s_nop 4 -; CHECK-NEXT: image_sample_lz v5, v[1:2], s[36:43], s[8:11] dmask:0x1 -; CHECK-NEXT: image_sample_lz v6, v[1:2], s[52:59], s[8:11] dmask:0x1 +; CHECK-NEXT: v_readlane_b32 s37, v3, 17 +; CHECK-NEXT: v_readlane_b32 s38, v3, 18 +; CHECK-NEXT: v_readlane_b32 s39, v3, 19 +; CHECK-NEXT: v_readlane_b32 s40, v3, 20 +; CHECK-NEXT: v_readlane_b32 s41, v3, 21 +; CHECK-NEXT: v_readlane_b32 s42, v3, 22 +; CHECK-NEXT: v_readlane_b32 s43, v3, 23 +; CHECK-NEXT: image_sample_lz v4, v[1:2], s[52:59], s[8:11] dmask:0x1 ; CHECK-NEXT: ; kill: killed $vgpr1_vgpr2 ; CHECK-NEXT: s_mov_b64 s[12:13], s[36:37] ; CHECK-NEXT: s_and_b64 vcc, exec, 0 -; CHECK-NEXT: v_readlane_b32 s44, v3, 0 -; CHECK-NEXT: v_readlane_b32 s45, v3, 1 -; CHECK-NEXT: v_readlane_b32 s46, v3, 2 -; CHECK-NEXT: v_readlane_b32 s47, v3, 3 -; CHECK-NEXT: v_readlane_b32 s48, v3, 4 -; CHECK-NEXT: v_readlane_b32 s49, v3, 5 -; CHECK-NEXT: v_readlane_b32 s50, v3, 6 -; CHECK-NEXT: v_readlane_b32 s51, v3, 7 +; CHECK-NEXT: v_readlane_b32 s44, v3, 24 +; CHECK-NEXT: v_readlane_b32 s45, v3, 25 +; CHECK-NEXT: image_sample_lz v5, v[1:2], s[36:43], s[8:11] dmask:0x1 +; CHECK-NEXT: v_readlane_b32 s46, v3, 26 +; CHECK-NEXT: v_readlane_b32 s47, v3, 27 +; CHECK-NEXT: v_readlane_b32 s48, v3, 28 +; CHECK-NEXT: v_readlane_b32 s49, v3, 29 +; CHECK-NEXT: v_readlane_b32 s50, v3, 30 +; CHECK-NEXT: v_readlane_b32 s51, v3, 31 ; CHECK-NEXT: s_mov_b64 s[14:15], s[38:39] ; CHECK-NEXT: s_mov_b64 s[16:17], s[40:41] ; CHECK-NEXT: s_mov_b64 s[18:19], s[42:43] -; CHECK-NEXT: ; kill: killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 ; CHECK-NEXT: ; kill: killed $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59 +; CHECK-NEXT: ; kill: killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 ; CHECK-NEXT: ; kill: killed $sgpr8_sgpr9_sgpr10 killed $sgpr11 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_sub_f32_e32 v1, v6, v5 +; CHECK-NEXT: v_sub_f32_e32 v1, v5, v4 ; CHECK-NEXT: v_mul_f32_e32 v0, v1, v0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: .LBB0_8: ; %bb33 +; CHECK-NEXT: .LBB0_9: ; %bb33 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_f32_e32 v2, v1, v0 ; CHECK-NEXT: v_sub_f32_e32 v1, v1, v2 ; CHECK-NEXT: s_mov_b64 vcc, vcc -; CHECK-NEXT: s_cbranch_vccz .LBB0_8 -; CHECK-NEXT: .LBB0_9: ; %Flow13 +; CHECK-NEXT: s_cbranch_vccz .LBB0_9 +; CHECK-NEXT: ; %bb.10: ; %Flow11 ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: .LBB0_10: ; %UnifiedReturnBlock +; CHECK-NEXT: .LBB0_11: ; %Flow13 ; CHECK-NEXT: s_or_b64 exec, exec, s[20:21] -; CHECK-NEXT: v_readlane_b32 s67, v8, 33 -; CHECK-NEXT: v_readlane_b32 s66, v8, 32 -; CHECK-NEXT: v_readlane_b32 s65, v8, 31 -; CHECK-NEXT: v_readlane_b32 s64, v8, 30 -; CHECK-NEXT: v_readlane_b32 s63, v8, 29 -; CHECK-NEXT: v_readlane_b32 s62, v8, 28 -; CHECK-NEXT: v_readlane_b32 s61, v8, 27 -; CHECK-NEXT: v_readlane_b32 s60, v8, 26 -; CHECK-NEXT: v_readlane_b32 s59, v8, 25 -; CHECK-NEXT: v_readlane_b32 s58, v8, 24 -; CHECK-NEXT: v_readlane_b32 s57, v8, 23 -; CHECK-NEXT: v_readlane_b32 s56, v8, 22 -; CHECK-NEXT: v_readlane_b32 s55, v8, 21 -; CHECK-NEXT: v_readlane_b32 s54, v8, 20 -; CHECK-NEXT: v_readlane_b32 s53, v8, 19 -; CHECK-NEXT: v_readlane_b32 s52, v8, 18 -; CHECK-NEXT: v_readlane_b32 s51, v8, 17 -; CHECK-NEXT: v_readlane_b32 s50, v8, 16 -; CHECK-NEXT: v_readlane_b32 s49, v8, 15 -; CHECK-NEXT: v_readlane_b32 s48, v8, 14 -; CHECK-NEXT: v_readlane_b32 s47, v8, 13 -; CHECK-NEXT: v_readlane_b32 s46, v8, 12 -; CHECK-NEXT: v_readlane_b32 s45, v8, 11 -; CHECK-NEXT: v_readlane_b32 s44, v8, 10 -; CHECK-NEXT: v_readlane_b32 s43, v8, 9 -; CHECK-NEXT: v_readlane_b32 s42, v8, 8 -; CHECK-NEXT: v_readlane_b32 s41, v8, 7 -; CHECK-NEXT: v_readlane_b32 s40, v8, 6 -; CHECK-NEXT: v_readlane_b32 s39, v8, 5 -; CHECK-NEXT: v_readlane_b32 s38, v8, 4 -; CHECK-NEXT: v_readlane_b32 s37, v8, 3 -; CHECK-NEXT: v_readlane_b32 s36, v8, 2 -; CHECK-NEXT: v_readlane_b32 s31, v8, 1 -; CHECK-NEXT: v_readlane_b32 s30, v8, 0 -; CHECK-NEXT: ; kill: killed $vgpr4 +; CHECK-NEXT: .LBB0_12: ; %UnifiedReturnBlock +; CHECK-NEXT: v_readlane_b32 s67, v7, 35 +; CHECK-NEXT: v_readlane_b32 s66, v7, 34 +; CHECK-NEXT: v_readlane_b32 s65, v7, 33 +; CHECK-NEXT: v_readlane_b32 s64, v7, 32 +; CHECK-NEXT: v_readlane_b32 s63, v7, 31 +; CHECK-NEXT: v_readlane_b32 s62, v7, 30 +; CHECK-NEXT: v_readlane_b32 s61, v7, 29 +; CHECK-NEXT: v_readlane_b32 s60, v7, 28 +; CHECK-NEXT: v_readlane_b32 s59, v7, 27 +; CHECK-NEXT: v_readlane_b32 s58, v7, 26 +; CHECK-NEXT: v_readlane_b32 s57, v7, 25 +; CHECK-NEXT: v_readlane_b32 s56, v7, 24 +; CHECK-NEXT: v_readlane_b32 s55, v7, 23 +; CHECK-NEXT: v_readlane_b32 s54, v7, 22 +; CHECK-NEXT: v_readlane_b32 s53, v7, 21 +; CHECK-NEXT: v_readlane_b32 s52, v7, 20 +; CHECK-NEXT: v_readlane_b32 s51, v7, 19 +; CHECK-NEXT: v_readlane_b32 s50, v7, 18 +; CHECK-NEXT: v_readlane_b32 s49, v7, 17 +; CHECK-NEXT: v_readlane_b32 s48, v7, 16 +; CHECK-NEXT: v_readlane_b32 s47, v7, 15 +; CHECK-NEXT: v_readlane_b32 s46, v7, 14 +; CHECK-NEXT: v_readlane_b32 s45, v7, 13 +; CHECK-NEXT: v_readlane_b32 s44, v7, 12 +; CHECK-NEXT: v_readlane_b32 s43, v7, 11 +; CHECK-NEXT: v_readlane_b32 s42, v7, 10 +; CHECK-NEXT: v_readlane_b32 s41, v7, 9 +; CHECK-NEXT: v_readlane_b32 s40, v7, 8 +; CHECK-NEXT: v_readlane_b32 s39, v7, 7 +; CHECK-NEXT: v_readlane_b32 s38, v7, 6 +; CHECK-NEXT: v_readlane_b32 s37, v7, 5 +; CHECK-NEXT: v_readlane_b32 s36, v7, 4 +; CHECK-NEXT: v_readlane_b32 s35, v7, 3 +; CHECK-NEXT: v_readlane_b32 s34, v7, 2 +; CHECK-NEXT: v_readlane_b32 s31, v7, 1 +; CHECK-NEXT: v_readlane_b32 s30, v7, 0 ; CHECK-NEXT: ; kill: killed $vgpr3 ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/image-sample-waterfall.ll b/llvm/test/CodeGen/AMDGPU/image-sample-waterfall.ll index 5cb9721ff7457..e0f8eff644fa2 100644 --- a/llvm/test/CodeGen/AMDGPU/image-sample-waterfall.ll +++ b/llvm/test/CodeGen/AMDGPU/image-sample-waterfall.ll @@ -24,10 +24,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32, float, float, < ; GCN-NEXT: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, s[[[SREG0]]:[[SREG7]]], {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 +; GCN-NEXT: s_xor_b64 [[LOOP_MASK:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE]] +; GCN-NEXT: s_and_b64 {{s\[[0-9]+:[0-9]+\]}}, [[LOOP_MASK]], -1 ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GCN-NEXT: ; implicit-def: $vgpr8_vgpr9 -; GCN-NEXT: s_xor_b64 exec, exec, [[SAVE]] -; GCN-NEXT: s_cbranch_execnz [[RSRC_LOOP]] +; GCN-NEXT: s_cselect_b64 exec, [[LOOP_MASK]], [[SAVE]] +; GCN-NEXT: s_cbranch_scc1 [[RSRC_LOOP]] define amdgpu_ps <4 x float> @water_loop_rsrc(<8 x i32> %rsrc, <4 x i32> inreg %samp, float %s, float %t) { main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -50,10 +52,12 @@ main_body: ; GCN-NEXT: s_nop 0 ; GCN-NEXT: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, s[[[SREG0]]:[[SREG3]]] dmask:0x1 +; GCN-NEXT: s_xor_b64 [[LOOP_MASK:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE]] +; GCN-NEXT: s_and_b64 {{s\[[0-9]+:[0-9]+\]}}, [[LOOP_MASK]], -1 ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GCN-NEXT: s_xor_b64 exec, exec, [[SAVE]] -; GCN-NEXT: s_cbranch_execnz [[SAMP_LOOP]] +; GCN-NEXT: s_cselect_b64 exec, [[LOOP_MASK]], [[SAVE]] +; GCN-NEXT: s_cbranch_scc1 [[SAMP_LOOP]] define amdgpu_ps <4 x float> @water_loop_samp(<8 x i32> inreg %rsrc, <4 x i32> %samp, float %s, float %t) { main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll index 7799b9509ceb0..daed7d04abefd 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -126,7 +126,7 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) { ; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] -; GCN-NEXT: v_writelane_b32 v40, s16, 18 +; GCN-NEXT: v_writelane_b32 v40, s16, 16 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 @@ -144,8 +144,6 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) { ; GCN-NEXT: v_writelane_b32 v40, s45, 13 ; GCN-NEXT: v_writelane_b32 v40, s46, 14 ; GCN-NEXT: v_writelane_b32 v40, s47, 15 -; GCN-NEXT: v_writelane_b32 v40, s48, 16 -; GCN-NEXT: v_writelane_b32 v40, s49, 17 ; GCN-NEXT: s_mov_b32 s42, s15 ; GCN-NEXT: s_mov_b32 s43, s14 ; GCN-NEXT: s_mov_b32 s44, s13 @@ -154,12 +152,11 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) { ; GCN-NEXT: s_mov_b64 s[36:37], s[8:9] ; GCN-NEXT: s_mov_b64 s[38:39], s[6:7] ; GCN-NEXT: s_mov_b64 s[40:41], s[4:5] -; GCN-NEXT: s_mov_b64 s[46:47], exec ; GCN-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_readfirstlane_b32 s16, v0 ; GCN-NEXT: v_readfirstlane_b32 s17, v1 ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] -; GCN-NEXT: s_and_saveexec_b64 s[48:49], vcc +; GCN-NEXT: s_and_saveexec_b64 s[46:47], vcc ; GCN-NEXT: s_mov_b64 s[4:5], s[40:41] ; GCN-NEXT: s_mov_b64 s[6:7], s[38:39] ; GCN-NEXT: s_mov_b64 s[8:9], s[36:37] @@ -171,12 +168,11 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) { ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_xor_b64 exec, exec, s[48:49] -; GCN-NEXT: s_cbranch_execnz .LBB2_1 +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[46:47] +; GCN-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GCN-NEXT: s_cselect_b64 exec, s[4:5], s[46:47] +; GCN-NEXT: s_cbranch_scc1 .LBB2_1 ; GCN-NEXT: ; %bb.2: -; GCN-NEXT: s_mov_b64 exec, s[46:47] -; GCN-NEXT: v_readlane_b32 s49, v40, 17 -; GCN-NEXT: v_readlane_b32 s48, v40, 16 ; GCN-NEXT: v_readlane_b32 s47, v40, 15 ; GCN-NEXT: v_readlane_b32 s46, v40, 14 ; GCN-NEXT: v_readlane_b32 s45, v40, 13 @@ -193,7 +189,7 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) { ; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 -; GCN-NEXT: v_readlane_b32 s4, v40, 18 +; GCN-NEXT: v_readlane_b32 s4, v40, 16 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] @@ -210,7 +206,7 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) { ; GISEL-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[18:19] -; GISEL-NEXT: v_writelane_b32 v40, s16, 18 +; GISEL-NEXT: v_writelane_b32 v40, s16, 16 ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 @@ -228,8 +224,6 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) { ; GISEL-NEXT: v_writelane_b32 v40, s45, 13 ; GISEL-NEXT: v_writelane_b32 v40, s46, 14 ; GISEL-NEXT: v_writelane_b32 v40, s47, 15 -; GISEL-NEXT: v_writelane_b32 v40, s48, 16 -; GISEL-NEXT: v_writelane_b32 v40, s49, 17 ; GISEL-NEXT: s_mov_b32 s42, s15 ; GISEL-NEXT: s_mov_b32 s43, s14 ; GISEL-NEXT: s_mov_b32 s44, s13 @@ -238,12 +232,11 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) { ; GISEL-NEXT: s_mov_b64 s[36:37], s[8:9] ; GISEL-NEXT: s_mov_b64 s[38:39], s[6:7] ; GISEL-NEXT: s_mov_b64 s[40:41], s[4:5] -; GISEL-NEXT: s_mov_b64 s[46:47], exec ; GISEL-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_readfirstlane_b32 s16, v0 ; GISEL-NEXT: v_readfirstlane_b32 s17, v1 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] -; GISEL-NEXT: s_and_saveexec_b64 s[48:49], vcc +; GISEL-NEXT: s_and_saveexec_b64 s[46:47], vcc ; GISEL-NEXT: s_mov_b64 s[4:5], s[40:41] ; GISEL-NEXT: s_mov_b64 s[6:7], s[38:39] ; GISEL-NEXT: s_mov_b64 s[8:9], s[36:37] @@ -255,12 +248,11 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) { ; GISEL-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GISEL-NEXT: ; implicit-def: $vgpr0 ; GISEL-NEXT: ; implicit-def: $vgpr31 -; GISEL-NEXT: s_xor_b64 exec, exec, s[48:49] -; GISEL-NEXT: s_cbranch_execnz .LBB2_1 +; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[46:47] +; GISEL-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GISEL-NEXT: s_cselect_b64 exec, s[4:5], s[46:47] +; GISEL-NEXT: s_cbranch_scc1 .LBB2_1 ; GISEL-NEXT: ; %bb.2: -; GISEL-NEXT: s_mov_b64 exec, s[46:47] -; GISEL-NEXT: v_readlane_b32 s49, v40, 17 -; GISEL-NEXT: v_readlane_b32 s48, v40, 16 ; GISEL-NEXT: v_readlane_b32 s47, v40, 15 ; GISEL-NEXT: v_readlane_b32 s46, v40, 14 ; GISEL-NEXT: v_readlane_b32 s45, v40, 13 @@ -277,7 +269,7 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) { ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 -; GISEL-NEXT: v_readlane_b32 s4, v40, 18 +; GISEL-NEXT: v_readlane_b32 s4, v40, 16 ; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[6:7] @@ -298,7 +290,7 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) { ; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] -; GCN-NEXT: v_writelane_b32 v40, s16, 18 +; GCN-NEXT: v_writelane_b32 v40, s16, 16 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 @@ -316,8 +308,6 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) { ; GCN-NEXT: v_writelane_b32 v40, s45, 13 ; GCN-NEXT: v_writelane_b32 v40, s46, 14 ; GCN-NEXT: v_writelane_b32 v40, s47, 15 -; GCN-NEXT: v_writelane_b32 v40, s48, 16 -; GCN-NEXT: v_writelane_b32 v40, s49, 17 ; GCN-NEXT: s_mov_b32 s42, s15 ; GCN-NEXT: s_mov_b32 s43, s14 ; GCN-NEXT: s_mov_b32 s44, s13 @@ -326,13 +316,12 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) { ; GCN-NEXT: s_mov_b64 s[36:37], s[8:9] ; GCN-NEXT: s_mov_b64 s[38:39], s[6:7] ; GCN-NEXT: s_mov_b64 s[40:41], s[4:5] -; GCN-NEXT: s_mov_b64 s[46:47], exec ; GCN-NEXT: v_mov_b32_e32 v2, 0x7b ; GCN-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_readfirstlane_b32 s16, v0 ; GCN-NEXT: v_readfirstlane_b32 s17, v1 ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] -; GCN-NEXT: s_and_saveexec_b64 s[48:49], vcc +; GCN-NEXT: s_and_saveexec_b64 s[46:47], vcc ; GCN-NEXT: s_mov_b64 s[4:5], s[40:41] ; GCN-NEXT: s_mov_b64 s[6:7], s[38:39] ; GCN-NEXT: s_mov_b64 s[8:9], s[36:37] @@ -346,12 +335,11 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) { ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN-NEXT: ; implicit-def: $vgpr31 ; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: s_xor_b64 exec, exec, s[48:49] -; GCN-NEXT: s_cbranch_execnz .LBB3_1 +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[46:47] +; GCN-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GCN-NEXT: s_cselect_b64 exec, s[4:5], s[46:47] +; GCN-NEXT: s_cbranch_scc1 .LBB3_1 ; GCN-NEXT: ; %bb.2: -; GCN-NEXT: s_mov_b64 exec, s[46:47] -; GCN-NEXT: v_readlane_b32 s49, v40, 17 -; GCN-NEXT: v_readlane_b32 s48, v40, 16 ; GCN-NEXT: v_readlane_b32 s47, v40, 15 ; GCN-NEXT: v_readlane_b32 s46, v40, 14 ; GCN-NEXT: v_readlane_b32 s45, v40, 13 @@ -368,7 +356,7 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) { ; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 -; GCN-NEXT: v_readlane_b32 s4, v40, 18 +; GCN-NEXT: v_readlane_b32 s4, v40, 16 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] @@ -385,7 +373,7 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) { ; GISEL-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[18:19] -; GISEL-NEXT: v_writelane_b32 v40, s16, 18 +; GISEL-NEXT: v_writelane_b32 v40, s16, 16 ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 @@ -403,8 +391,6 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) { ; GISEL-NEXT: v_writelane_b32 v40, s45, 13 ; GISEL-NEXT: v_writelane_b32 v40, s46, 14 ; GISEL-NEXT: v_writelane_b32 v40, s47, 15 -; GISEL-NEXT: v_writelane_b32 v40, s48, 16 -; GISEL-NEXT: v_writelane_b32 v40, s49, 17 ; GISEL-NEXT: s_mov_b32 s42, s15 ; GISEL-NEXT: s_mov_b32 s43, s14 ; GISEL-NEXT: s_mov_b32 s44, s13 @@ -413,12 +399,11 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) { ; GISEL-NEXT: s_mov_b64 s[36:37], s[8:9] ; GISEL-NEXT: s_mov_b64 s[38:39], s[6:7] ; GISEL-NEXT: s_mov_b64 s[40:41], s[4:5] -; GISEL-NEXT: s_mov_b64 s[46:47], exec ; GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_readfirstlane_b32 s16, v0 ; GISEL-NEXT: v_readfirstlane_b32 s17, v1 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] -; GISEL-NEXT: s_and_saveexec_b64 s[48:49], vcc +; GISEL-NEXT: s_and_saveexec_b64 s[46:47], vcc ; GISEL-NEXT: v_mov_b32_e32 v0, 0x7b ; GISEL-NEXT: s_mov_b64 s[4:5], s[40:41] ; GISEL-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -431,12 +416,11 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) { ; GISEL-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GISEL-NEXT: ; implicit-def: $vgpr0 ; GISEL-NEXT: ; implicit-def: $vgpr31 -; GISEL-NEXT: s_xor_b64 exec, exec, s[48:49] -; GISEL-NEXT: s_cbranch_execnz .LBB3_1 +; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[46:47] +; GISEL-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GISEL-NEXT: s_cselect_b64 exec, s[4:5], s[46:47] +; GISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GISEL-NEXT: ; %bb.2: -; GISEL-NEXT: s_mov_b64 exec, s[46:47] -; GISEL-NEXT: v_readlane_b32 s49, v40, 17 -; GISEL-NEXT: v_readlane_b32 s48, v40, 16 ; GISEL-NEXT: v_readlane_b32 s47, v40, 15 ; GISEL-NEXT: v_readlane_b32 s46, v40, 14 ; GISEL-NEXT: v_readlane_b32 s45, v40, 13 @@ -453,7 +437,7 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) { ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 -; GISEL-NEXT: v_readlane_b32 s4, v40, 18 +; GISEL-NEXT: v_readlane_b32 s4, v40, 16 ; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[6:7] @@ -474,7 +458,7 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) { ; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] -; GCN-NEXT: v_writelane_b32 v40, s16, 18 +; GCN-NEXT: v_writelane_b32 v40, s16, 16 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 @@ -492,8 +476,6 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) { ; GCN-NEXT: v_writelane_b32 v40, s45, 13 ; GCN-NEXT: v_writelane_b32 v40, s46, 14 ; GCN-NEXT: v_writelane_b32 v40, s47, 15 -; GCN-NEXT: v_writelane_b32 v40, s48, 16 -; GCN-NEXT: v_writelane_b32 v40, s49, 17 ; GCN-NEXT: s_mov_b32 s42, s15 ; GCN-NEXT: s_mov_b32 s43, s14 ; GCN-NEXT: s_mov_b32 s44, s13 @@ -502,12 +484,11 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) { ; GCN-NEXT: s_mov_b64 s[36:37], s[8:9] ; GCN-NEXT: s_mov_b64 s[38:39], s[6:7] ; GCN-NEXT: s_mov_b64 s[40:41], s[4:5] -; GCN-NEXT: s_mov_b64 s[46:47], exec ; GCN-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_readfirstlane_b32 s16, v0 ; GCN-NEXT: v_readfirstlane_b32 s17, v1 ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] -; GCN-NEXT: s_and_saveexec_b64 s[48:49], vcc +; GCN-NEXT: s_and_saveexec_b64 s[46:47], vcc ; GCN-NEXT: s_mov_b64 s[4:5], s[40:41] ; GCN-NEXT: s_mov_b64 s[6:7], s[38:39] ; GCN-NEXT: s_mov_b64 s[8:9], s[36:37] @@ -520,13 +501,12 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) { ; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_xor_b64 exec, exec, s[48:49] -; GCN-NEXT: s_cbranch_execnz .LBB4_1 +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[46:47] +; GCN-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GCN-NEXT: s_cselect_b64 exec, s[4:5], s[46:47] +; GCN-NEXT: s_cbranch_scc1 .LBB4_1 ; GCN-NEXT: ; %bb.2: -; GCN-NEXT: s_mov_b64 exec, s[46:47] ; GCN-NEXT: v_add_i32_e32 v0, vcc, 1, v2 -; GCN-NEXT: v_readlane_b32 s49, v40, 17 -; GCN-NEXT: v_readlane_b32 s48, v40, 16 ; GCN-NEXT: v_readlane_b32 s47, v40, 15 ; GCN-NEXT: v_readlane_b32 s46, v40, 14 ; GCN-NEXT: v_readlane_b32 s45, v40, 13 @@ -543,7 +523,7 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) { ; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 -; GCN-NEXT: v_readlane_b32 s4, v40, 18 +; GCN-NEXT: v_readlane_b32 s4, v40, 16 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] @@ -560,7 +540,7 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) { ; GISEL-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[18:19] -; GISEL-NEXT: v_writelane_b32 v40, s16, 18 +; GISEL-NEXT: v_writelane_b32 v40, s16, 16 ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 @@ -578,8 +558,6 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) { ; GISEL-NEXT: v_writelane_b32 v40, s45, 13 ; GISEL-NEXT: v_writelane_b32 v40, s46, 14 ; GISEL-NEXT: v_writelane_b32 v40, s47, 15 -; GISEL-NEXT: v_writelane_b32 v40, s48, 16 -; GISEL-NEXT: v_writelane_b32 v40, s49, 17 ; GISEL-NEXT: s_mov_b32 s42, s15 ; GISEL-NEXT: s_mov_b32 s43, s14 ; GISEL-NEXT: s_mov_b32 s44, s13 @@ -588,12 +566,11 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) { ; GISEL-NEXT: s_mov_b64 s[36:37], s[8:9] ; GISEL-NEXT: s_mov_b64 s[38:39], s[6:7] ; GISEL-NEXT: s_mov_b64 s[40:41], s[4:5] -; GISEL-NEXT: s_mov_b64 s[46:47], exec ; GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_readfirstlane_b32 s16, v0 ; GISEL-NEXT: v_readfirstlane_b32 s17, v1 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] -; GISEL-NEXT: s_and_saveexec_b64 s[48:49], vcc +; GISEL-NEXT: s_and_saveexec_b64 s[46:47], vcc ; GISEL-NEXT: s_mov_b64 s[4:5], s[40:41] ; GISEL-NEXT: s_mov_b64 s[6:7], s[38:39] ; GISEL-NEXT: s_mov_b64 s[8:9], s[36:37] @@ -606,13 +583,12 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) { ; GISEL-NEXT: v_mov_b32_e32 v1, v0 ; GISEL-NEXT: ; implicit-def: $vgpr0 ; GISEL-NEXT: ; implicit-def: $vgpr31 -; GISEL-NEXT: s_xor_b64 exec, exec, s[48:49] -; GISEL-NEXT: s_cbranch_execnz .LBB4_1 +; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[46:47] +; GISEL-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GISEL-NEXT: s_cselect_b64 exec, s[4:5], s[46:47] +; GISEL-NEXT: s_cbranch_scc1 .LBB4_1 ; GISEL-NEXT: ; %bb.2: -; GISEL-NEXT: s_mov_b64 exec, s[46:47] ; GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v1 -; GISEL-NEXT: v_readlane_b32 s49, v40, 17 -; GISEL-NEXT: v_readlane_b32 s48, v40, 16 ; GISEL-NEXT: v_readlane_b32 s47, v40, 15 ; GISEL-NEXT: v_readlane_b32 s46, v40, 14 ; GISEL-NEXT: v_readlane_b32 s45, v40, 13 @@ -629,7 +605,7 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) { ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 -; GISEL-NEXT: v_readlane_b32 s4, v40, 18 +; GISEL-NEXT: v_readlane_b32 s4, v40, 16 ; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[6:7] @@ -651,7 +627,7 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) { ; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] -; GCN-NEXT: v_writelane_b32 v40, s16, 20 +; GCN-NEXT: v_writelane_b32 v40, s16, 18 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 @@ -671,8 +647,6 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) { ; GCN-NEXT: v_writelane_b32 v40, s47, 15 ; GCN-NEXT: v_writelane_b32 v40, s48, 16 ; GCN-NEXT: v_writelane_b32 v40, s49, 17 -; GCN-NEXT: v_writelane_b32 v40, s50, 18 -; GCN-NEXT: v_writelane_b32 v40, s51, 19 ; GCN-NEXT: s_mov_b32 s42, s15 ; GCN-NEXT: s_mov_b32 s43, s14 ; GCN-NEXT: s_mov_b32 s44, s13 @@ -683,15 +657,15 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) { ; GCN-NEXT: s_mov_b64 s[40:41], s[4:5] ; GCN-NEXT: v_and_b32_e32 v2, 1, v2 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GCN-NEXT: s_and_saveexec_b64 s[46:47], vcc -; GCN-NEXT: s_cbranch_execz .LBB5_4 -; GCN-NEXT: ; %bb.1: ; %bb1 -; GCN-NEXT: s_mov_b64 s[48:49], exec -; GCN-NEXT: .LBB5_2: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: s_and_b64 s[4:5], vcc, -1 +; GCN-NEXT: s_mov_b64 s[46:47], exec +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB5_3 +; GCN-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_readfirstlane_b32 s16, v0 ; GCN-NEXT: v_readfirstlane_b32 s17, v1 ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] -; GCN-NEXT: s_and_saveexec_b64 s[50:51], vcc +; GCN-NEXT: s_and_saveexec_b64 s[48:49], vcc ; GCN-NEXT: s_mov_b64 s[4:5], s[40:41] ; GCN-NEXT: s_mov_b64 s[6:7], s[38:39] ; GCN-NEXT: s_mov_b64 s[8:9], s[36:37] @@ -703,14 +677,13 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) { ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_xor_b64 exec, exec, s[50:51] -; GCN-NEXT: s_cbranch_execnz .LBB5_2 -; GCN-NEXT: ; %bb.3: -; GCN-NEXT: s_mov_b64 exec, s[48:49] -; GCN-NEXT: .LBB5_4: ; %bb2 +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[48:49] +; GCN-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GCN-NEXT: s_cselect_b64 exec, s[4:5], s[48:49] +; GCN-NEXT: s_cbranch_scc1 .LBB5_1 +; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_or_b64 exec, exec, s[46:47] -; GCN-NEXT: v_readlane_b32 s51, v40, 19 -; GCN-NEXT: v_readlane_b32 s50, v40, 18 +; GCN-NEXT: .LBB5_3: ; %bb2 ; GCN-NEXT: v_readlane_b32 s49, v40, 17 ; GCN-NEXT: v_readlane_b32 s48, v40, 16 ; GCN-NEXT: v_readlane_b32 s47, v40, 15 @@ -729,7 +702,7 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) { ; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 -; GCN-NEXT: v_readlane_b32 s4, v40, 20 +; GCN-NEXT: v_readlane_b32 s4, v40, 18 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] @@ -746,7 +719,7 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) { ; GISEL-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[18:19] -; GISEL-NEXT: v_writelane_b32 v40, s16, 20 +; GISEL-NEXT: v_writelane_b32 v40, s16, 18 ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 @@ -766,8 +739,6 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) { ; GISEL-NEXT: v_writelane_b32 v40, s47, 15 ; GISEL-NEXT: v_writelane_b32 v40, s48, 16 ; GISEL-NEXT: v_writelane_b32 v40, s49, 17 -; GISEL-NEXT: v_writelane_b32 v40, s50, 18 -; GISEL-NEXT: v_writelane_b32 v40, s51, 19 ; GISEL-NEXT: s_mov_b32 s42, s15 ; GISEL-NEXT: s_mov_b32 s43, s14 ; GISEL-NEXT: s_mov_b32 s44, s13 @@ -778,15 +749,15 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) { ; GISEL-NEXT: s_mov_b64 s[40:41], s[4:5] ; GISEL-NEXT: v_and_b32_e32 v2, 1, v2 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: s_and_saveexec_b64 s[46:47], vcc -; GISEL-NEXT: s_cbranch_execz .LBB5_4 -; GISEL-NEXT: ; %bb.1: ; %bb1 -; GISEL-NEXT: s_mov_b64 s[48:49], exec -; GISEL-NEXT: .LBB5_2: ; =>This Inner Loop Header: Depth=1 +; GISEL-NEXT: s_and_b64 s[4:5], vcc, -1 +; GISEL-NEXT: s_mov_b64 s[46:47], exec +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB5_3 +; GISEL-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_readfirstlane_b32 s16, v0 ; GISEL-NEXT: v_readfirstlane_b32 s17, v1 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] -; GISEL-NEXT: s_and_saveexec_b64 s[50:51], vcc +; GISEL-NEXT: s_and_saveexec_b64 s[48:49], vcc ; GISEL-NEXT: s_mov_b64 s[4:5], s[40:41] ; GISEL-NEXT: s_mov_b64 s[6:7], s[38:39] ; GISEL-NEXT: s_mov_b64 s[8:9], s[36:37] @@ -798,14 +769,13 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) { ; GISEL-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GISEL-NEXT: ; implicit-def: $vgpr0 ; GISEL-NEXT: ; implicit-def: $vgpr31 -; GISEL-NEXT: s_xor_b64 exec, exec, s[50:51] -; GISEL-NEXT: s_cbranch_execnz .LBB5_2 -; GISEL-NEXT: ; %bb.3: -; GISEL-NEXT: s_mov_b64 exec, s[48:49] -; GISEL-NEXT: .LBB5_4: ; %bb2 +; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[48:49] +; GISEL-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GISEL-NEXT: s_cselect_b64 exec, s[4:5], s[48:49] +; GISEL-NEXT: s_cbranch_scc1 .LBB5_1 +; GISEL-NEXT: ; %bb.2: ; GISEL-NEXT: s_or_b64 exec, exec, s[46:47] -; GISEL-NEXT: v_readlane_b32 s51, v40, 19 -; GISEL-NEXT: v_readlane_b32 s50, v40, 18 +; GISEL-NEXT: .LBB5_3: ; %bb2 ; GISEL-NEXT: v_readlane_b32 s49, v40, 17 ; GISEL-NEXT: v_readlane_b32 s48, v40, 16 ; GISEL-NEXT: v_readlane_b32 s47, v40, 15 @@ -824,7 +794,7 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) { ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 -; GISEL-NEXT: v_readlane_b32 s4, v40, 20 +; GISEL-NEXT: v_readlane_b32 s4, v40, 18 ; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[6:7] @@ -847,11 +817,11 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GCN-LABEL: test_indirect_call_vgpr_ptr_inreg_arg: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s5, s33 +; GCN-NEXT: s_mov_b32 s10, s33 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 @@ -885,19 +855,19 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GCN-NEXT: v_writelane_b32 v40, s61, 29 ; GCN-NEXT: v_writelane_b32 v40, s62, 30 ; GCN-NEXT: v_writelane_b32 v40, s63, 31 -; GCN-NEXT: s_mov_b64 s[6:7], exec -; GCN-NEXT: s_movk_i32 s4, 0x7b ; GCN-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_readfirstlane_b32 s8, v0 -; GCN-NEXT: v_readfirstlane_b32 s9, v1 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GCN-NEXT: s_and_saveexec_b64 s[10:11], vcc -; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9] +; GCN-NEXT: v_readfirstlane_b32 s6, v0 +; GCN-NEXT: v_readfirstlane_b32 s7, v1 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[0:1] +; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GCN-NEXT: s_movk_i32 s4, 0x7b +; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: s_xor_b64 exec, exec, s[10:11] -; GCN-NEXT: s_cbranch_execnz .LBB6_1 +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[8:9] +; GCN-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GCN-NEXT: s_cselect_b64 exec, s[4:5], s[8:9] +; GCN-NEXT: s_cbranch_scc1 .LBB6_1 ; GCN-NEXT: ; %bb.2: -; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: v_readlane_b32 s63, v40, 31 ; GCN-NEXT: v_readlane_b32 s62, v40, 30 ; GCN-NEXT: v_readlane_b32 s61, v40, 29 @@ -930,22 +900,22 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 -; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s5 +; GCN-NEXT: s_mov_b32 s33, s10 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_indirect_call_vgpr_ptr_inreg_arg: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s5, s33 +; GISEL-NEXT: s_mov_b32 s10, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 -; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GISEL-NEXT: s_mov_b64 exec, s[6:7] +; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 @@ -979,19 +949,19 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GISEL-NEXT: v_writelane_b32 v40, s61, 29 ; GISEL-NEXT: v_writelane_b32 v40, s62, 30 ; GISEL-NEXT: v_writelane_b32 v40, s63, 31 -; GISEL-NEXT: s_mov_b64 s[6:7], exec -; GISEL-NEXT: s_movk_i32 s4, 0x7b ; GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_readfirstlane_b32 s8, v0 ; GISEL-NEXT: v_readfirstlane_b32 s9, v1 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GISEL-NEXT: s_and_saveexec_b64 s[10:11], vcc +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GISEL-NEXT: s_movk_i32 s4, 0x7b ; GISEL-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GISEL-NEXT: ; implicit-def: $vgpr0 -; GISEL-NEXT: s_xor_b64 exec, exec, s[10:11] -; GISEL-NEXT: s_cbranch_execnz .LBB6_1 +; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; GISEL-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GISEL-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GISEL-NEXT: s_cbranch_scc1 .LBB6_1 ; GISEL-NEXT: ; %bb.2: -; GISEL-NEXT: s_mov_b64 exec, s[6:7] ; GISEL-NEXT: v_readlane_b32 s63, v40, 31 ; GISEL-NEXT: v_readlane_b32 s62, v40, 30 ; GISEL-NEXT: v_readlane_b32 s61, v40, 29 @@ -1024,11 +994,11 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 -; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GISEL-NEXT: s_mov_b64 exec, s[6:7] +; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: s_mov_b32 s33, s5 +; GISEL-NEXT: s_mov_b32 s33, s10 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void %fptr(i32 inreg 123) @@ -1079,19 +1049,19 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) { ; GCN-NEXT: v_writelane_b32 v41, s62, 30 ; GCN-NEXT: v_writelane_b32 v41, s63, 31 ; GCN-NEXT: v_mov_b32_e32 v40, v0 -; GCN-NEXT: s_mov_b64 s[4:5], exec ; GCN-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_readfirstlane_b32 s6, v1 -; GCN-NEXT: v_readfirstlane_b32 s7, v2 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[1:2] -; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GCN-NEXT: v_readfirstlane_b32 s4, v1 +; GCN-NEXT: v_readfirstlane_b32 s5, v2 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[1:2] +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-NEXT: v_mov_b32_e32 v0, v40 -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: ; implicit-def: $vgpr1_vgpr2 -; GCN-NEXT: s_xor_b64 exec, exec, s[8:9] -; GCN-NEXT: s_cbranch_execnz .LBB7_1 +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; GCN-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GCN-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GCN-NEXT: s_cbranch_scc1 .LBB7_1 ; GCN-NEXT: ; %bb.2: -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, v40 ; GCN-NEXT: v_readlane_b32 s63, v41, 31 ; GCN-NEXT: v_readlane_b32 s62, v41, 30 @@ -1177,19 +1147,19 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) { ; GISEL-NEXT: v_writelane_b32 v41, s62, 30 ; GISEL-NEXT: v_writelane_b32 v41, s63, 31 ; GISEL-NEXT: v_mov_b32_e32 v40, v0 -; GISEL-NEXT: s_mov_b64 s[4:5], exec ; GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_readfirstlane_b32 s6, v1 ; GISEL-NEXT: v_readfirstlane_b32 s7, v2 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[1:2] -; GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v40 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr1 -; GISEL-NEXT: s_xor_b64 exec, exec, s[8:9] -; GISEL-NEXT: s_cbranch_execnz .LBB7_1 +; GISEL-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; GISEL-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GISEL-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GISEL-NEXT: ; %bb.2: -; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: v_mov_b32_e32 v0, v40 ; GISEL-NEXT: v_readlane_b32 s63, v41, 31 ; GISEL-NEXT: v_readlane_b32 s62, v41, 30 @@ -1281,20 +1251,20 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) { ; GCN-NEXT: v_writelane_b32 v40, s61, 29 ; GCN-NEXT: v_writelane_b32 v40, s62, 30 ; GCN-NEXT: v_writelane_b32 v40, s63, 31 -; GCN-NEXT: s_mov_b64 s[4:5], exec ; GCN-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_readfirstlane_b32 s8, v1 -; GCN-NEXT: v_readfirstlane_b32 s9, v2 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2] -; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9] +; GCN-NEXT: v_readfirstlane_b32 s6, v1 +; GCN-NEXT: v_readfirstlane_b32 s7, v2 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[1:2] +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GCN-NEXT: v_mov_b32_e32 v3, v0 ; GCN-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: s_xor_b64 exec, exec, s[6:7] -; GCN-NEXT: s_cbranch_execnz .LBB8_1 +; GCN-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; GCN-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GCN-NEXT: s_cbranch_scc1 .LBB8_1 ; GCN-NEXT: ; %bb.2: -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, v3 ; GCN-NEXT: v_readlane_b32 s63, v40, 31 ; GCN-NEXT: v_readlane_b32 s62, v40, 30 @@ -1377,20 +1347,20 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) { ; GISEL-NEXT: v_writelane_b32 v40, s61, 29 ; GISEL-NEXT: v_writelane_b32 v40, s62, 30 ; GISEL-NEXT: v_writelane_b32 v40, s63, 31 -; GISEL-NEXT: s_mov_b64 s[4:5], exec ; GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 -; GISEL-NEXT: v_readfirstlane_b32 s8, v1 -; GISEL-NEXT: v_readfirstlane_b32 s9, v2 -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2] -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GISEL-NEXT: s_swappc_b64 s[30:31], s[8:9] +; GISEL-NEXT: v_readfirstlane_b32 s6, v1 +; GISEL-NEXT: v_readfirstlane_b32 s7, v2 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[1:2] +; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GISEL-NEXT: v_mov_b32_e32 v2, v0 ; GISEL-NEXT: ; implicit-def: $vgpr1 ; GISEL-NEXT: ; implicit-def: $vgpr0 -; GISEL-NEXT: s_xor_b64 exec, exec, s[6:7] -; GISEL-NEXT: s_cbranch_execnz .LBB8_1 +; GISEL-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; GISEL-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GISEL-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GISEL-NEXT: s_cbranch_scc1 .LBB8_1 ; GISEL-NEXT: ; %bb.2: -; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: v_mov_b32_e32 v0, v2 ; GISEL-NEXT: v_readlane_b32 s63, v40, 31 ; GISEL-NEXT: v_readlane_b32 s62, v40, 30 @@ -1478,18 +1448,18 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) { ; GCN-NEXT: v_writelane_b32 v40, s61, 29 ; GCN-NEXT: v_writelane_b32 v40, s62, 30 ; GCN-NEXT: v_writelane_b32 v40, s63, 31 -; GCN-NEXT: s_mov_b64 s[4:5], exec ; GCN-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_readfirstlane_b32 s6, v0 -; GCN-NEXT: v_readfirstlane_b32 s7, v1 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[0:1] -; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: v_readfirstlane_b32 s4, v0 +; GCN-NEXT: v_readfirstlane_b32 s5, v1 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: s_xor_b64 exec, exec, s[8:9] -; GCN-NEXT: s_cbranch_execnz .LBB9_1 +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; GCN-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GCN-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GCN-NEXT: s_cbranch_scc1 .LBB9_1 ; GCN-NEXT: ; %bb.2: -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_readlane_b32 s63, v40, 31 ; GCN-NEXT: v_readlane_b32 s62, v40, 30 ; GCN-NEXT: v_readlane_b32 s61, v40, 29 @@ -1571,18 +1541,18 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) { ; GISEL-NEXT: v_writelane_b32 v40, s61, 29 ; GISEL-NEXT: v_writelane_b32 v40, s62, 30 ; GISEL-NEXT: v_writelane_b32 v40, s63, 31 -; GISEL-NEXT: s_mov_b64 s[4:5], exec ; GISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_readfirstlane_b32 s6, v0 ; GISEL-NEXT: v_readfirstlane_b32 s7, v1 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[0:1] -; GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr0 -; GISEL-NEXT: s_xor_b64 exec, exec, s[8:9] -; GISEL-NEXT: s_cbranch_execnz .LBB9_1 +; GISEL-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; GISEL-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GISEL-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GISEL-NEXT: s_cbranch_scc1 .LBB9_1 ; GISEL-NEXT: ; %bb.2: -; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: v_readlane_b32 s63, v40, 31 ; GISEL-NEXT: v_readlane_b32 s62, v40, 30 ; GISEL-NEXT: v_readlane_b32 s61, v40, 29 diff --git a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll index 8183106b0ce9d..bb6b5f5ffaa89 100644 --- a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: llc -mtriple=amdgcn -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=SI %s ; RUN: opt -mtriple=amdgcn-- -S -amdgpu-unify-divergent-exit-nodes -verify -simplifycfg-require-and-preserve-domtree=1 %s | FileCheck -check-prefix=IR %s diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.ll index 555af5013bc4e..b180c39edb770 100644 --- a/llvm/test/CodeGen/AMDGPU/inline-asm.ll +++ b/llvm/test/CodeGen/AMDGPU/inline-asm.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK %s diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll index cddfb21a6fbdf..75ef72cbf225f 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll @@ -57,16 +57,18 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: s_mov_b32 s12, s13 ; GFX11-NEXT: s_mov_b64 s[10:11], s[4:5] ; GFX11-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_mov_b32 s20, exec_lo ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; GFX11-NEXT: s_mov_b32 s6, 0 ; GFX11-NEXT: s_mov_b32 s0, -1 -; GFX11-NEXT: s_mov_b32 s20, exec_lo ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mul_lo_u32 v0, s21, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: s_cbranch_execz .LBB2_13 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, -1 +; GFX11-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB2_13 ; GFX11-NEXT: ; %bb.1: ; %bb14 ; GFX11-NEXT: s_load_b128 s[16:19], s[2:3], 0x2c ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -171,10 +173,14 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: .LBB2_12: ; %Flow11 ; GFX11-NEXT: s_and_b32 s6, s1, exec_lo ; GFX11-NEXT: s_or_not1_b32 s0, s17, exec_lo -; GFX11-NEXT: .LBB2_13: ; %Flow9 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s20 -; GFX11-NEXT: s_and_saveexec_b32 s7, s0 -; GFX11-NEXT: s_cbranch_execz .LBB2_15 +; GFX11-NEXT: .LBB2_13: ; %Flow9 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s0, s0, exec_lo +; GFX11-NEXT: s_mov_b32 s7, exec_lo +; GFX11-NEXT: s_and_b32 s1, s0, -1 +; GFX11-NEXT: s_cmov_b32 exec_lo, s0 +; GFX11-NEXT: s_cbranch_scc0 .LBB2_15 ; GFX11-NEXT: ; %bb.14: ; %bb43 ; GFX11-NEXT: s_add_u32 s8, s2, 0x58 ; GFX11-NEXT: s_addc_u32 s9, s3, 0 @@ -187,12 +193,16 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_or_b32 s6, s6, exec_lo -; GFX11-NEXT: .LBB2_15: ; %Flow14 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s7 -; GFX11-NEXT: s_and_saveexec_b32 s0, s6 +; GFX11-NEXT: .LBB2_15: ; %Flow14 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s0, s6, exec_lo +; GFX11-NEXT: s_and_b32 s1, s0, -1 +; GFX11-NEXT: s_cmov_b32 exec_lo, s0 +; GFX11-NEXT: s_cbranch_scc0 .LBB2_17 ; GFX11-NEXT: ; %bb.16: ; %UnifiedUnreachableBlock ; GFX11-NEXT: ; divergent unreachable -; GFX11-NEXT: ; %bb.17: ; %UnifiedReturnBlock +; GFX11-NEXT: .LBB2_17: ; %UnifiedReturnBlock ; GFX11-NEXT: s_endpgm bb: %i = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll index df03e89370377..8e0a238b35373 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll @@ -23,11 +23,12 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-NEXT: v_mov_b32_e32 v4, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB0_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: syncscope_workgroup_nortn: @@ -43,11 +44,12 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: syncscope_workgroup_nortn: @@ -66,10 +68,11 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB0_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_and_b32 s6, s5, -1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-FLATSCR-LABEL: syncscope_workgroup_nortn: @@ -85,11 +88,12 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-FLATSCR-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-FLATSCR-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-FLATSCR-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v3 -; GFX9-FLATSCR-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-FLATSCR-NEXT: s_cbranch_execnz .LBB0_1 +; GFX9-FLATSCR-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-FLATSCR-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX9-FLATSCR-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-FLATSCR-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: syncscope_workgroup_nortn: @@ -100,7 +104,6 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc @@ -109,11 +112,12 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB0_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_and_b32 s2, s1, -1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: syncscope_workgroup_nortn: @@ -128,7 +132,6 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN @@ -137,11 +140,12 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB0_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_and_b32 s2, s1, -1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst ret void @@ -167,10 +171,11 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB1_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -192,10 +197,11 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 +; GFX90A-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX90A-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX90A-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -217,10 +223,11 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB1_1 +; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4 +; GFX10-NEXT: s_and_b32 s6, s5, -1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX10-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -240,10 +247,11 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { ; GFX9-FLATSCR-NEXT: buffer_wbinvl1_vol ; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX9-FLATSCR-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-FLATSCR-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-FLATSCR-NEXT: s_cbranch_execnz .LBB1_1 +; GFX9-FLATSCR-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-FLATSCR-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX9-FLATSCR-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-FLATSCR-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX9-FLATSCR-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-FLATSCR-NEXT: s_setpc_b64 s[30:31] ; @@ -266,11 +274,12 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB1_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX11-NEXT: s_and_b32 s2, s1, -1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -296,11 +305,12 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB1_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s1, exec_lo, s0 +; GFX12-NEXT: s_and_b32 s2, s1, -1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw nand ptr addrspace(1) %ptr, i32 4 seq_cst @@ -696,8 +706,9 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -716,8 +727,9 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB5_2 +; GFX90A-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -735,8 +747,9 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; GFX10-NEXT: s_mov_b32 s2, exec_lo ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB5_2 +; GFX10-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX10-NEXT: ; %bb.1: ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -756,8 +769,9 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; GFX9-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-FLATSCR-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX9-FLATSCR-NEXT: s_cmov_b64 exec, vcc +; GFX9-FLATSCR-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX9-FLATSCR-NEXT: ; %bb.1: ; GFX9-FLATSCR-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) @@ -773,11 +787,12 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; GFX11-LABEL: atomic_add_local: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: s_mov_b32 s3, exec_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: s_cbranch_execz .LBB5_2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX11-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX11-NEXT: ; %bb.1: ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -794,11 +809,12 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; GFX12-LABEL: atomic_add_local: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_mov_b32 s2, exec_lo -; GFX12-NEXT: s_mov_b32 s3, exec_lo +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12-NEXT: s_cbranch_execz .LBB5_2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX12-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX12-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX12-NEXT: ; %bb.1: ; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -893,9 +909,11 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB7_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -905,8 +923,8 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB7_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB7_2: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 @@ -922,9 +940,11 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX90A-NEXT: ; implicit-def: $vgpr1 -; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB7_2 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -934,8 +954,8 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX90A-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: .LBB7_2: ; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX90A-NEXT: .LBB7_2: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_readfirstlane_b32 s2, v1 @@ -948,11 +968,13 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX10-LABEL: atomic_add_ret_local: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_mov_b32 s3, exec_lo -; GFX10-NEXT: ; implicit-def: $vgpr1 +; GFX10-NEXT: s_mov_b32 s2, exec_lo ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX10-NEXT: ; implicit-def: $vgpr1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB7_2 +; GFX10-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX10-NEXT: ; %bb.1: ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -963,9 +985,9 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX10-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: .LBB7_2: ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: .LBB7_2: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_readfirstlane_b32 s2, v1 @@ -981,9 +1003,11 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX9-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-FLATSCR-NEXT: s_mov_b64 s[2:3], exec +; GFX9-FLATSCR-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX9-FLATSCR-NEXT: ; implicit-def: $vgpr1 -; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB7_2 +; GFX9-FLATSCR-NEXT: s_cmov_b64 exec, vcc +; GFX9-FLATSCR-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX9-FLATSCR-NEXT: ; %bb.1: ; GFX9-FLATSCR-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) @@ -993,8 +1017,8 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-FLATSCR-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-FLATSCR-NEXT: .LBB7_2: ; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-FLATSCR-NEXT: .LBB7_2: ; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s2, v1 @@ -1011,8 +1035,10 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX11-NEXT: ; implicit-def: $vgpr1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX11-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX11-NEXT: ; %bb.1: ; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1023,8 +1049,8 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: .LBB7_2: ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: .LBB7_2: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s2, v1 @@ -1042,8 +1068,10 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX12-NEXT: ; implicit-def: $vgpr1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12-NEXT: s_cbranch_execz .LBB7_2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX12-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX12-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX12-NEXT: ; %bb.1: ; GFX12-NEXT: s_load_b32 s4, s[0:1], 0x2c ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1054,8 +1082,8 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: .LBB7_2: ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX12-NEXT: .LBB7_2: ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s2, v1 @@ -1082,9 +1110,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB8_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1093,8 +1123,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: .LBB8_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB8_2: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 @@ -1110,9 +1140,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX90A-NEXT: ; implicit-def: $vgpr1 -; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB8_2 +; GFX90A-NEXT: s_cmov_b64 exec, vcc +; GFX90A-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -1121,8 +1153,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX90A-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: .LBB8_2: ; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX90A-NEXT: .LBB8_2: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_readfirstlane_b32 s2, v1 @@ -1135,11 +1167,13 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10-LABEL: add_i32_constant: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_mov_b32 s3, exec_lo -; GFX10-NEXT: ; implicit-def: $vgpr1 +; GFX10-NEXT: s_mov_b32 s2, exec_lo ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX10-NEXT: ; implicit-def: $vgpr1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB8_2 +; GFX10-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX10-NEXT: ; %bb.1: ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1148,9 +1182,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: .LBB8_2: ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: .LBB8_2: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_readfirstlane_b32 s2, v1 @@ -1166,9 +1200,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-FLATSCR-NEXT: s_mov_b64 s[2:3], exec +; GFX9-FLATSCR-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX9-FLATSCR-NEXT: ; implicit-def: $vgpr1 -; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB8_2 +; GFX9-FLATSCR-NEXT: s_cmov_b64 exec, vcc +; GFX9-FLATSCR-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX9-FLATSCR-NEXT: ; %bb.1: ; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) @@ -1177,8 +1213,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-FLATSCR-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: .LBB8_2: ; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-FLATSCR-NEXT: .LBB8_2: ; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s2, v1 @@ -1195,8 +1231,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX11-NEXT: ; implicit-def: $vgpr1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX11-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX11-NEXT: ; %bb.1: ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1206,8 +1244,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: .LBB8_2: ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: .LBB8_2: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s2, v1 @@ -1225,8 +1263,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX12-NEXT: ; implicit-def: $vgpr1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12-NEXT: s_cbranch_execz .LBB8_2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX12-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX12-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX12-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX12-NEXT: ; %bb.1: ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1236,8 +1276,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: .LBB8_2: ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX12-NEXT: .LBB8_2: ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s2, v1 diff --git a/llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll b/llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll index f950717c591a9..57df1bfb3bf45 100644 --- a/llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll +++ b/llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll @@ -13,9 +13,11 @@ define bfloat @sitofp_i128_to_bf16(i128 %x) { ; GCN-NEXT: v_or_b32_e32 v5, v1, v3 ; GCN-NEXT: v_or_b32_e32 v4, v0, v2 ; GCN-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN-NEXT: s_mov_b64 s[6:7], exec +; GCN-NEXT: s_and_b64 s[4:5], vcc, -1 ; GCN-NEXT: v_mov_b32_e32 v4, 0 -; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GCN-NEXT: s_cbranch_execz .LBB0_14 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB0_14 ; GCN-NEXT: ; %bb.1: ; %itofp-if-end ; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v3 ; GCN-NEXT: v_xor_b32_e32 v0, v5, v0 @@ -38,11 +40,13 @@ define bfloat @sitofp_i128_to_bf16(i128 %x) { ; GCN-NEXT: v_add_u32_e32 v6, 64, v6 ; GCN-NEXT: v_cndmask_b32_e32 v7, v6, v2, vcc ; GCN-NEXT: v_sub_u32_e32 v6, 0x80, v7 -; GCN-NEXT: v_sub_u32_e32 v2, 0x7f, v7 ; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 25, v6 +; GCN-NEXT: s_xor_b64 s[4:5], vcc, exec +; GCN-NEXT: s_and_b64 s[8:9], vcc, -1 +; GCN-NEXT: v_sub_u32_e32 v2, 0x7f, v7 ; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB0_3 ; GCN-NEXT: ; %bb.2: ; %itofp-if-else ; GCN-NEXT: v_add_u32_e32 v4, 0xffffff98, v7 ; GCN-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] @@ -52,18 +56,24 @@ define bfloat @sitofp_i128_to_bf16(i128 %x) { ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN-NEXT: ; implicit-def: $vgpr7 ; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GCN-NEXT: ; %bb.3: ; %Flow3 -; GCN-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_13 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: .LBB0_3: ; %Flow3 +; GCN-NEXT: s_xor_b64 s[8:9], s[4:5], exec +; GCN-NEXT: s_and_b64 s[10:11], s[4:5], -1 +; GCN-NEXT: s_cmov_b64 exec, s[4:5] +; GCN-NEXT: s_cbranch_scc0 .LBB0_13 ; GCN-NEXT: ; %bb.4: ; %NodeBlock ; GCN-NEXT: v_cmp_lt_i32_e32 vcc, 25, v6 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[10:11], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_8 +; GCN-NEXT: s_xor_b64 s[10:11], vcc, exec +; GCN-NEXT: s_and_b64 s[4:5], vcc, -1 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB0_8 ; GCN-NEXT: ; %bb.5: ; %LeafBlock ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 26, v6 -; GCN-NEXT: s_and_saveexec_b64 s[12:13], vcc -; GCN-NEXT: s_cbranch_execz .LBB0_7 +; GCN-NEXT: s_mov_b64 s[12:13], exec +; GCN-NEXT: s_and_b64 s[4:5], vcc, -1 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB0_7 ; GCN-NEXT: ; %bb.6: ; %itofp-sw-default ; GCN-NEXT: v_sub_u32_e32 v12, 0x66, v7 ; GCN-NEXT: v_sub_u32_e32 v10, 64, v12 @@ -102,29 +112,36 @@ define bfloat @sitofp_i128_to_bf16(i128 %x) { ; GCN-NEXT: v_or_b32_e32 v8, v15, v0 ; GCN-NEXT: v_mov_b32_e32 v0, v8 ; GCN-NEXT: v_mov_b32_e32 v1, v9 -; GCN-NEXT: .LBB0_7: ; %Flow1 ; GCN-NEXT: s_or_b64 exec, exec, s[12:13] +; GCN-NEXT: .LBB0_7: ; %Flow1 +; GCN-NEXT: s_or_b64 exec, exec, s[10:11] ; GCN-NEXT: .LBB0_8: ; %Flow2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; GCN-NEXT: s_xor_b64 s[4:5], s[10:11], exec +; GCN-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; GCN-NEXT: s_cmov_b64 exec, s[10:11] +; GCN-NEXT: s_cbranch_scc0 .LBB0_10 ; GCN-NEXT: ; %bb.9: ; %itofp-sw-bb ; GCN-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GCN-NEXT: ; %bb.10: ; %itofp-sw-epilog ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: .LBB0_10: ; %itofp-sw-epilog ; GCN-NEXT: v_lshrrev_b32_e32 v4, 2, v0 ; GCN-NEXT: v_and_or_b32 v0, v4, 1, v0 ; GCN-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 ; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: v_and_b32_e32 v4, 0x4000000, v0 ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: s_mov_b64 s[4:5], exec +; GCN-NEXT: s_and_b64 s[10:11], vcc, -1 ; GCN-NEXT: v_alignbit_b32 v8, v1, v0, 2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB0_12 ; GCN-NEXT: ; %bb.11: ; %itofp-if-then20 ; GCN-NEXT: v_alignbit_b32 v8, v1, v0, 3 ; GCN-NEXT: v_mov_b32_e32 v2, v6 -; GCN-NEXT: ; %bb.12: ; %Flow ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: .LBB0_13: ; %Flow4 +; GCN-NEXT: .LBB0_12: ; %Flow ; GCN-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-NEXT: .LBB0_13: ; %itofp-if-end26 ; GCN-NEXT: v_and_b32_e32 v0, 0x80000000, v3 ; GCN-NEXT: v_lshl_add_u32 v1, v2, 23, 1.0 ; GCN-NEXT: v_and_b32_e32 v2, 0x7fffff, v8 @@ -136,8 +153,8 @@ define bfloat @sitofp_i128_to_bf16(i128 %x) { ; GCN-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GCN-NEXT: .LBB0_14: ; %Flow5 ; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: .LBB0_14: ; %itofp-return ; GCN-NEXT: v_mov_b32_e32 v0, v4 ; GCN-NEXT: s_setpc_b64 s[30:31] %cvt = sitofp i128 %x to bfloat @@ -151,9 +168,11 @@ define bfloat @uitofp_i128_to_bf16(i128 %x) { ; GCN-NEXT: v_or_b32_e32 v5, v1, v3 ; GCN-NEXT: v_or_b32_e32 v4, v0, v2 ; GCN-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN-NEXT: s_mov_b64 s[6:7], exec +; GCN-NEXT: s_and_b64 s[4:5], vcc, -1 ; GCN-NEXT: v_mov_b32_e32 v4, 0 -; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GCN-NEXT: s_cbranch_execz .LBB1_14 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB1_14 ; GCN-NEXT: ; %bb.1: ; %itofp-if-end ; GCN-NEXT: v_ffbh_u32_e32 v4, v2 ; GCN-NEXT: v_add_u32_e32 v4, 32, v4 @@ -167,11 +186,13 @@ define bfloat @uitofp_i128_to_bf16(i128 %x) { ; GCN-NEXT: v_add_u32_e32 v5, 64, v5 ; GCN-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc ; GCN-NEXT: v_sub_u32_e32 v5, 0x80, v6 -; GCN-NEXT: v_sub_u32_e32 v4, 0x7f, v6 ; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 25, v5 +; GCN-NEXT: s_xor_b64 s[4:5], vcc, exec +; GCN-NEXT: s_and_b64 s[8:9], vcc, -1 +; GCN-NEXT: v_sub_u32_e32 v4, 0x7f, v6 ; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB1_3 ; GCN-NEXT: ; %bb.2: ; %itofp-if-else ; GCN-NEXT: v_add_u32_e32 v2, 0xffffff98, v6 ; GCN-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] @@ -181,18 +202,24 @@ define bfloat @uitofp_i128_to_bf16(i128 %x) { ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN-NEXT: ; implicit-def: $vgpr6 ; GCN-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN-NEXT: ; %bb.3: ; %Flow3 -; GCN-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_13 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: .LBB1_3: ; %Flow3 +; GCN-NEXT: s_xor_b64 s[8:9], s[4:5], exec +; GCN-NEXT: s_and_b64 s[10:11], s[4:5], -1 +; GCN-NEXT: s_cmov_b64 exec, s[4:5] +; GCN-NEXT: s_cbranch_scc0 .LBB1_13 ; GCN-NEXT: ; %bb.4: ; %NodeBlock ; GCN-NEXT: v_cmp_lt_i32_e32 vcc, 25, v5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[10:11], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_8 +; GCN-NEXT: s_xor_b64 s[10:11], vcc, exec +; GCN-NEXT: s_and_b64 s[4:5], vcc, -1 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB1_8 ; GCN-NEXT: ; %bb.5: ; %LeafBlock ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 26, v5 -; GCN-NEXT: s_and_saveexec_b64 s[12:13], vcc -; GCN-NEXT: s_cbranch_execz .LBB1_7 +; GCN-NEXT: s_mov_b64 s[12:13], exec +; GCN-NEXT: s_and_b64 s[4:5], vcc, -1 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB1_7 ; GCN-NEXT: ; %bb.6: ; %itofp-sw-default ; GCN-NEXT: v_sub_u32_e32 v11, 0x66, v6 ; GCN-NEXT: v_sub_u32_e32 v9, 64, v11 @@ -231,29 +258,36 @@ define bfloat @uitofp_i128_to_bf16(i128 %x) { ; GCN-NEXT: v_or_b32_e32 v7, v14, v0 ; GCN-NEXT: v_mov_b32_e32 v0, v7 ; GCN-NEXT: v_mov_b32_e32 v1, v8 -; GCN-NEXT: .LBB1_7: ; %Flow1 ; GCN-NEXT: s_or_b64 exec, exec, s[12:13] +; GCN-NEXT: .LBB1_7: ; %Flow1 +; GCN-NEXT: s_or_b64 exec, exec, s[10:11] ; GCN-NEXT: .LBB1_8: ; %Flow2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; GCN-NEXT: s_xor_b64 s[4:5], s[10:11], exec +; GCN-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; GCN-NEXT: s_cmov_b64 exec, s[10:11] +; GCN-NEXT: s_cbranch_scc0 .LBB1_10 ; GCN-NEXT: ; %bb.9: ; %itofp-sw-bb ; GCN-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GCN-NEXT: ; %bb.10: ; %itofp-sw-epilog ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: .LBB1_10: ; %itofp-sw-epilog ; GCN-NEXT: v_lshrrev_b32_e32 v2, 2, v0 ; GCN-NEXT: v_and_or_b32 v0, v2, 1, v0 ; GCN-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 ; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: v_and_b32_e32 v2, 0x4000000, v0 ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: s_mov_b64 s[4:5], exec +; GCN-NEXT: s_and_b64 s[10:11], vcc, -1 ; GCN-NEXT: v_alignbit_b32 v7, v1, v0, 2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB1_12 ; GCN-NEXT: ; %bb.11: ; %itofp-if-then20 ; GCN-NEXT: v_alignbit_b32 v7, v1, v0, 3 ; GCN-NEXT: v_mov_b32_e32 v4, v5 -; GCN-NEXT: ; %bb.12: ; %Flow ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: .LBB1_13: ; %Flow4 +; GCN-NEXT: .LBB1_12: ; %Flow ; GCN-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-NEXT: .LBB1_13: ; %itofp-if-end26 ; GCN-NEXT: v_and_b32_e32 v0, 0x7fffff, v7 ; GCN-NEXT: v_lshl_or_b32 v0, v4, 23, v0 ; GCN-NEXT: v_add_u32_e32 v0, 1.0, v0 @@ -264,8 +298,8 @@ define bfloat @uitofp_i128_to_bf16(i128 %x) { ; GCN-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GCN-NEXT: .LBB1_14: ; %Flow5 ; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: .LBB1_14: ; %itofp-return ; GCN-NEXT: v_mov_b32_e32 v0, v4 ; GCN-NEXT: s_setpc_b64 s[30:31] %cvt = uitofp i128 %x to bfloat diff --git a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll index c6aa2182aec80..562a5b6ce65ea 100644 --- a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll @@ -9,9 +9,11 @@ define float @sitofp_i128_to_f32(i128 %x) { ; SDAG-NEXT: v_or_b32_e32 v5, v1, v3 ; SDAG-NEXT: v_or_b32_e32 v4, v0, v2 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; SDAG-NEXT: s_mov_b64 s[6:7], exec +; SDAG-NEXT: s_and_b64 s[4:5], vcc, -1 ; SDAG-NEXT: v_mov_b32_e32 v4, 0 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc -; SDAG-NEXT: s_cbranch_execz .LBB0_14 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB0_14 ; SDAG-NEXT: ; %bb.1: ; %itofp-if-end ; SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v3 ; SDAG-NEXT: v_xor_b32_e32 v0, v5, v0 @@ -34,11 +36,13 @@ define float @sitofp_i128_to_f32(i128 %x) { ; SDAG-NEXT: v_add_u32_e32 v6, 64, v6 ; SDAG-NEXT: v_cndmask_b32_e32 v7, v6, v2, vcc ; SDAG-NEXT: v_sub_u32_e32 v6, 0x80, v7 -; SDAG-NEXT: v_sub_u32_e32 v2, 0x7f, v7 ; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 25, v6 +; SDAG-NEXT: s_xor_b64 s[4:5], vcc, exec +; SDAG-NEXT: s_and_b64 s[8:9], vcc, -1 +; SDAG-NEXT: v_sub_u32_e32 v2, 0x7f, v7 ; SDAG-NEXT: ; implicit-def: $vgpr8 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB0_3 ; SDAG-NEXT: ; %bb.2: ; %itofp-if-else ; SDAG-NEXT: v_add_u32_e32 v4, 0xffffff98, v7 ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] @@ -48,18 +52,24 @@ define float @sitofp_i128_to_f32(i128 %x) { ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr7 ; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 -; SDAG-NEXT: ; %bb.3: ; %Flow3 -; SDAG-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB0_13 +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: .LBB0_3: ; %Flow3 +; SDAG-NEXT: s_xor_b64 s[8:9], s[4:5], exec +; SDAG-NEXT: s_and_b64 s[10:11], s[4:5], -1 +; SDAG-NEXT: s_cmov_b64 exec, s[4:5] +; SDAG-NEXT: s_cbranch_scc0 .LBB0_13 ; SDAG-NEXT: ; %bb.4: ; %NodeBlock ; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 25, v6 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB0_8 +; SDAG-NEXT: s_xor_b64 s[10:11], vcc, exec +; SDAG-NEXT: s_and_b64 s[4:5], vcc, -1 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB0_8 ; SDAG-NEXT: ; %bb.5: ; %LeafBlock ; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 26, v6 -; SDAG-NEXT: s_and_saveexec_b64 s[12:13], vcc -; SDAG-NEXT: s_cbranch_execz .LBB0_7 +; SDAG-NEXT: s_mov_b64 s[12:13], exec +; SDAG-NEXT: s_and_b64 s[4:5], vcc, -1 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB0_7 ; SDAG-NEXT: ; %bb.6: ; %itofp-sw-default ; SDAG-NEXT: v_sub_u32_e32 v12, 0x66, v7 ; SDAG-NEXT: v_sub_u32_e32 v10, 64, v12 @@ -98,35 +108,42 @@ define float @sitofp_i128_to_f32(i128 %x) { ; SDAG-NEXT: v_or_b32_e32 v8, v15, v0 ; SDAG-NEXT: v_mov_b32_e32 v0, v8 ; SDAG-NEXT: v_mov_b32_e32 v1, v9 -; SDAG-NEXT: .LBB0_7: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] +; SDAG-NEXT: .LBB0_7: ; %Flow1 +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB0_8: ; %Flow2 -; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: s_xor_b64 s[4:5], s[10:11], exec +; SDAG-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; SDAG-NEXT: s_cmov_b64 exec, s[10:11] +; SDAG-NEXT: s_cbranch_scc0 .LBB0_10 ; SDAG-NEXT: ; %bb.9: ; %itofp-sw-bb ; SDAG-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; SDAG-NEXT: ; %bb.10: ; %itofp-sw-epilog ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: .LBB0_10: ; %itofp-sw-epilog ; SDAG-NEXT: v_lshrrev_b32_e32 v4, 2, v0 ; SDAG-NEXT: v_and_or_b32 v0, v4, 1, v0 ; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 ; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; SDAG-NEXT: v_and_b32_e32 v4, 0x4000000, v0 ; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SDAG-NEXT: s_mov_b64 s[4:5], exec +; SDAG-NEXT: s_and_b64 s[10:11], vcc, -1 ; SDAG-NEXT: v_alignbit_b32 v8, v1, v0, 2 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB0_12 ; SDAG-NEXT: ; %bb.11: ; %itofp-if-then20 ; SDAG-NEXT: v_alignbit_b32 v8, v1, v0, 3 ; SDAG-NEXT: v_mov_b32_e32 v2, v6 -; SDAG-NEXT: ; %bb.12: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] -; SDAG-NEXT: .LBB0_13: ; %Flow4 +; SDAG-NEXT: .LBB0_12: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: .LBB0_13: ; %itofp-if-end26 ; SDAG-NEXT: v_and_b32_e32 v0, 0x80000000, v3 ; SDAG-NEXT: v_lshl_add_u32 v1, v2, 23, 1.0 ; SDAG-NEXT: v_and_b32_e32 v2, 0x7fffff, v8 ; SDAG-NEXT: v_or3_b32 v4, v2, v0, v1 -; SDAG-NEXT: .LBB0_14: ; %Flow5 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: .LBB0_14: ; %itofp-return ; SDAG-NEXT: v_mov_b32_e32 v0, v4 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -136,10 +153,12 @@ define float @sitofp_i128_to_f32(i128 %x) { ; GISEL-NEXT: v_or_b32_e32 v4, v0, v2 ; GISEL-NEXT: v_or_b32_e32 v5, v1, v3 ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GISEL-NEXT: s_mov_b32 s4, 0 -; GISEL-NEXT: v_mov_b32_e32 v4, s4 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GISEL-NEXT: s_cbranch_execz .LBB0_14 +; GISEL-NEXT: s_mov_b32 s8, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], exec +; GISEL-NEXT: s_and_b64 s[4:5], vcc, -1 +; GISEL-NEXT: v_mov_b32_e32 v4, s8 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB0_14 ; GISEL-NEXT: ; %bb.1: ; %itofp-if-end ; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v3 ; GISEL-NEXT: v_xor_b32_e32 v0, v6, v0 @@ -162,11 +181,13 @@ define float @sitofp_i128_to_f32(i128 %x) { ; GISEL-NEXT: v_min_u32_e32 v5, v5, v7 ; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc ; GISEL-NEXT: v_sub_u32_e32 v8, 0x80, v5 -; GISEL-NEXT: v_sub_u32_e32 v7, 0x7f, v5 ; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 24, v8 +; GISEL-NEXT: s_xor_b64 s[4:5], vcc, exec +; GISEL-NEXT: s_and_b64 s[8:9], vcc, -1 +; GISEL-NEXT: v_sub_u32_e32 v7, 0x7f, v5 ; GISEL-NEXT: ; implicit-def: $vgpr4 -; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB0_3 ; GISEL-NEXT: ; %bb.2: ; %itofp-if-else ; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff98, v5 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] @@ -176,18 +197,24 @@ define float @sitofp_i128_to_f32(i128 %x) { ; GISEL-NEXT: ; implicit-def: $vgpr0 ; GISEL-NEXT: ; implicit-def: $vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr2 -; GISEL-NEXT: ; %bb.3: ; %Flow3 -; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GISEL-NEXT: s_cbranch_execz .LBB0_13 +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB0_3: ; %Flow3 +; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], exec +; GISEL-NEXT: s_and_b64 s[10:11], s[4:5], -1 +; GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GISEL-NEXT: s_cbranch_scc0 .LBB0_13 ; GISEL-NEXT: ; %bb.4: ; %NodeBlock ; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 26, v8 -; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GISEL-NEXT: s_xor_b64 s[10:11], exec, s[4:5] -; GISEL-NEXT: s_cbranch_execz .LBB0_8 +; GISEL-NEXT: s_xor_b64 s[10:11], vcc, exec +; GISEL-NEXT: s_and_b64 s[4:5], vcc, -1 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB0_8 ; GISEL-NEXT: ; %bb.5: ; %LeafBlock ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 26, v8 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc -; GISEL-NEXT: s_cbranch_execz .LBB0_7 +; GISEL-NEXT: s_mov_b64 s[12:13], exec +; GISEL-NEXT: s_and_b64 s[4:5], vcc, -1 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB0_7 ; GISEL-NEXT: ; %bb.6: ; %itofp-sw-default ; GISEL-NEXT: v_sub_u32_e32 v4, 0x66, v5 ; GISEL-NEXT: v_sub_u32_e32 v11, 64, v4 @@ -230,36 +257,43 @@ define float @sitofp_i128_to_f32(i128 %x) { ; GISEL-NEXT: v_mov_b32_e32 v1, v4 ; GISEL-NEXT: v_mov_b32_e32 v2, v5 ; GISEL-NEXT: v_mov_b32_e32 v3, v6 -; GISEL-NEXT: .LBB0_7: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: .LBB0_7: ; %Flow1 +; GISEL-NEXT: s_or_b64 exec, exec, s[10:11] ; GISEL-NEXT: .LBB0_8: ; %Flow2 -; GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; GISEL-NEXT: s_xor_b64 s[4:5], s[10:11], exec +; GISEL-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; GISEL-NEXT: s_cmov_b64 exec, s[10:11] +; GISEL-NEXT: s_cbranch_scc0 .LBB0_10 ; GISEL-NEXT: ; %bb.9: ; %itofp-sw-bb ; GISEL-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GISEL-NEXT: ; %bb.10: ; %itofp-sw-epilog ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB0_10: ; %itofp-sw-epilog ; GISEL-NEXT: v_bfe_u32 v2, v0, 2, 1 ; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 ; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 ; GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GISEL-NEXT: v_and_b32_e32 v2, 0x4000000, v0 ; GISEL-NEXT: v_mov_b32_e32 v3, 0 -; GISEL-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1] ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1] +; GISEL-NEXT: s_mov_b64 s[4:5], exec +; GISEL-NEXT: s_and_b64 s[10:11], vcc, -1 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB0_12 ; GISEL-NEXT: ; %bb.11: ; %itofp-if-then20 ; GISEL-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1] ; GISEL-NEXT: v_mov_b32_e32 v7, v8 -; GISEL-NEXT: ; %bb.12: ; %Flow ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GISEL-NEXT: .LBB0_13: ; %Flow4 +; GISEL-NEXT: .LBB0_12: ; %Flow ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: .LBB0_13: ; %itofp-if-end26 ; GISEL-NEXT: v_and_b32_e32 v0, 0x80000000, v6 ; GISEL-NEXT: v_lshl_add_u32 v1, v7, 23, 1.0 ; GISEL-NEXT: v_and_b32_e32 v2, 0x7fffff, v4 ; GISEL-NEXT: v_or3_b32 v4, v2, v0, v1 -; GISEL-NEXT: .LBB0_14: ; %Flow5 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-NEXT: .LBB0_14: ; %itofp-return ; GISEL-NEXT: v_mov_b32_e32 v0, v4 ; GISEL-NEXT: s_setpc_b64 s[30:31] %cvt = sitofp i128 %x to float @@ -273,9 +307,11 @@ define float @uitofp_i128_to_f32(i128 %x) { ; SDAG-NEXT: v_or_b32_e32 v5, v1, v3 ; SDAG-NEXT: v_or_b32_e32 v4, v0, v2 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; SDAG-NEXT: s_mov_b64 s[6:7], exec +; SDAG-NEXT: s_and_b64 s[4:5], vcc, -1 ; SDAG-NEXT: v_mov_b32_e32 v4, 0 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc -; SDAG-NEXT: s_cbranch_execz .LBB1_14 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB1_14 ; SDAG-NEXT: ; %bb.1: ; %itofp-if-end ; SDAG-NEXT: v_ffbh_u32_e32 v4, v2 ; SDAG-NEXT: v_add_u32_e32 v4, 32, v4 @@ -289,11 +325,13 @@ define float @uitofp_i128_to_f32(i128 %x) { ; SDAG-NEXT: v_add_u32_e32 v5, 64, v5 ; SDAG-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc ; SDAG-NEXT: v_sub_u32_e32 v5, 0x80, v6 -; SDAG-NEXT: v_sub_u32_e32 v4, 0x7f, v6 ; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 25, v5 +; SDAG-NEXT: s_xor_b64 s[4:5], vcc, exec +; SDAG-NEXT: s_and_b64 s[8:9], vcc, -1 +; SDAG-NEXT: v_sub_u32_e32 v4, 0x7f, v6 ; SDAG-NEXT: ; implicit-def: $vgpr7 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB1_3 ; SDAG-NEXT: ; %bb.2: ; %itofp-if-else ; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff98, v6 ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] @@ -303,18 +341,24 @@ define float @uitofp_i128_to_f32(i128 %x) { ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr6 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: ; %bb.3: ; %Flow3 -; SDAG-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB1_13 +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: .LBB1_3: ; %Flow3 +; SDAG-NEXT: s_xor_b64 s[8:9], s[4:5], exec +; SDAG-NEXT: s_and_b64 s[10:11], s[4:5], -1 +; SDAG-NEXT: s_cmov_b64 exec, s[4:5] +; SDAG-NEXT: s_cbranch_scc0 .LBB1_13 ; SDAG-NEXT: ; %bb.4: ; %NodeBlock ; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 25, v5 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB1_8 +; SDAG-NEXT: s_xor_b64 s[10:11], vcc, exec +; SDAG-NEXT: s_and_b64 s[4:5], vcc, -1 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB1_8 ; SDAG-NEXT: ; %bb.5: ; %LeafBlock ; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 26, v5 -; SDAG-NEXT: s_and_saveexec_b64 s[12:13], vcc -; SDAG-NEXT: s_cbranch_execz .LBB1_7 +; SDAG-NEXT: s_mov_b64 s[12:13], exec +; SDAG-NEXT: s_and_b64 s[4:5], vcc, -1 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB1_7 ; SDAG-NEXT: ; %bb.6: ; %itofp-sw-default ; SDAG-NEXT: v_sub_u32_e32 v11, 0x66, v6 ; SDAG-NEXT: v_sub_u32_e32 v9, 64, v11 @@ -353,34 +397,41 @@ define float @uitofp_i128_to_f32(i128 %x) { ; SDAG-NEXT: v_or_b32_e32 v7, v14, v0 ; SDAG-NEXT: v_mov_b32_e32 v0, v7 ; SDAG-NEXT: v_mov_b32_e32 v1, v8 -; SDAG-NEXT: .LBB1_7: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] +; SDAG-NEXT: .LBB1_7: ; %Flow1 +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB1_8: ; %Flow2 -; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: s_xor_b64 s[4:5], s[10:11], exec +; SDAG-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; SDAG-NEXT: s_cmov_b64 exec, s[10:11] +; SDAG-NEXT: s_cbranch_scc0 .LBB1_10 ; SDAG-NEXT: ; %bb.9: ; %itofp-sw-bb ; SDAG-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; SDAG-NEXT: ; %bb.10: ; %itofp-sw-epilog ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: .LBB1_10: ; %itofp-sw-epilog ; SDAG-NEXT: v_lshrrev_b32_e32 v2, 2, v0 ; SDAG-NEXT: v_and_or_b32 v0, v2, 1, v0 ; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 ; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; SDAG-NEXT: v_and_b32_e32 v2, 0x4000000, v0 ; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SDAG-NEXT: s_mov_b64 s[4:5], exec +; SDAG-NEXT: s_and_b64 s[10:11], vcc, -1 ; SDAG-NEXT: v_alignbit_b32 v7, v1, v0, 2 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB1_12 ; SDAG-NEXT: ; %bb.11: ; %itofp-if-then20 ; SDAG-NEXT: v_alignbit_b32 v7, v1, v0, 3 ; SDAG-NEXT: v_mov_b32_e32 v4, v5 -; SDAG-NEXT: ; %bb.12: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] -; SDAG-NEXT: .LBB1_13: ; %Flow4 +; SDAG-NEXT: .LBB1_12: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: .LBB1_13: ; %itofp-if-end26 ; SDAG-NEXT: v_and_b32_e32 v0, 0x7fffff, v7 ; SDAG-NEXT: v_lshl_or_b32 v0, v4, 23, v0 ; SDAG-NEXT: v_add_u32_e32 v4, 1.0, v0 -; SDAG-NEXT: .LBB1_14: ; %Flow5 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: .LBB1_14: ; %itofp-return ; SDAG-NEXT: v_mov_b32_e32 v0, v4 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -390,10 +441,12 @@ define float @uitofp_i128_to_f32(i128 %x) { ; GISEL-NEXT: v_or_b32_e32 v4, v0, v2 ; GISEL-NEXT: v_or_b32_e32 v5, v1, v3 ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GISEL-NEXT: s_mov_b32 s4, 0 -; GISEL-NEXT: v_mov_b32_e32 v4, s4 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GISEL-NEXT: s_cbranch_execz .LBB1_14 +; GISEL-NEXT: s_mov_b32 s8, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], exec +; GISEL-NEXT: s_and_b64 s[4:5], vcc, -1 +; GISEL-NEXT: v_mov_b32_e32 v4, s8 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB1_14 ; GISEL-NEXT: ; %bb.1: ; %itofp-if-end ; GISEL-NEXT: v_ffbh_u32_e32 v5, v0 ; GISEL-NEXT: v_ffbh_u32_e32 v4, v1 @@ -407,11 +460,13 @@ define float @uitofp_i128_to_f32(i128 %x) { ; GISEL-NEXT: v_min_u32_e32 v5, v5, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc ; GISEL-NEXT: v_sub_u32_e32 v7, 0x80, v5 -; GISEL-NEXT: v_sub_u32_e32 v6, 0x7f, v5 ; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 24, v7 +; GISEL-NEXT: s_xor_b64 s[4:5], vcc, exec +; GISEL-NEXT: s_and_b64 s[8:9], vcc, -1 +; GISEL-NEXT: v_sub_u32_e32 v6, 0x7f, v5 ; GISEL-NEXT: ; implicit-def: $vgpr4 -; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB1_3 ; GISEL-NEXT: ; %bb.2: ; %itofp-if-else ; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff98, v5 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] @@ -421,18 +476,24 @@ define float @uitofp_i128_to_f32(i128 %x) { ; GISEL-NEXT: ; implicit-def: $vgpr0 ; GISEL-NEXT: ; implicit-def: $vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr2 -; GISEL-NEXT: ; %bb.3: ; %Flow3 -; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GISEL-NEXT: s_cbranch_execz .LBB1_13 +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB1_3: ; %Flow3 +; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], exec +; GISEL-NEXT: s_and_b64 s[10:11], s[4:5], -1 +; GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GISEL-NEXT: s_cbranch_scc0 .LBB1_13 ; GISEL-NEXT: ; %bb.4: ; %NodeBlock ; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 26, v7 -; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GISEL-NEXT: s_xor_b64 s[10:11], exec, s[4:5] -; GISEL-NEXT: s_cbranch_execz .LBB1_8 +; GISEL-NEXT: s_xor_b64 s[10:11], vcc, exec +; GISEL-NEXT: s_and_b64 s[4:5], vcc, -1 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB1_8 ; GISEL-NEXT: ; %bb.5: ; %LeafBlock ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 26, v7 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc -; GISEL-NEXT: s_cbranch_execz .LBB1_7 +; GISEL-NEXT: s_mov_b64 s[12:13], exec +; GISEL-NEXT: s_and_b64 s[4:5], vcc, -1 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB1_7 ; GISEL-NEXT: ; %bb.6: ; %itofp-sw-default ; GISEL-NEXT: v_sub_u32_e32 v4, 0x66, v5 ; GISEL-NEXT: v_sub_u32_e32 v10, 64, v4 @@ -475,35 +536,42 @@ define float @uitofp_i128_to_f32(i128 %x) { ; GISEL-NEXT: v_mov_b32_e32 v1, v4 ; GISEL-NEXT: v_mov_b32_e32 v2, v5 ; GISEL-NEXT: v_mov_b32_e32 v3, v6 -; GISEL-NEXT: .LBB1_7: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: .LBB1_7: ; %Flow1 +; GISEL-NEXT: s_or_b64 exec, exec, s[10:11] ; GISEL-NEXT: .LBB1_8: ; %Flow2 -; GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; GISEL-NEXT: s_xor_b64 s[4:5], s[10:11], exec +; GISEL-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; GISEL-NEXT: s_cmov_b64 exec, s[10:11] +; GISEL-NEXT: s_cbranch_scc0 .LBB1_10 ; GISEL-NEXT: ; %bb.9: ; %itofp-sw-bb ; GISEL-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GISEL-NEXT: ; %bb.10: ; %itofp-sw-epilog ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB1_10: ; %itofp-sw-epilog ; GISEL-NEXT: v_bfe_u32 v2, v0, 2, 1 ; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 ; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 ; GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GISEL-NEXT: v_and_b32_e32 v2, 0x4000000, v0 ; GISEL-NEXT: v_mov_b32_e32 v3, 0 -; GISEL-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1] ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1] +; GISEL-NEXT: s_mov_b64 s[4:5], exec +; GISEL-NEXT: s_and_b64 s[10:11], vcc, -1 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB1_12 ; GISEL-NEXT: ; %bb.11: ; %itofp-if-then20 ; GISEL-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1] ; GISEL-NEXT: v_mov_b32_e32 v6, v7 -; GISEL-NEXT: ; %bb.12: ; %Flow ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GISEL-NEXT: .LBB1_13: ; %Flow4 +; GISEL-NEXT: .LBB1_12: ; %Flow ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: .LBB1_13: ; %itofp-if-end26 ; GISEL-NEXT: v_lshl_add_u32 v0, v6, 23, 1.0 ; GISEL-NEXT: v_mov_b32_e32 v1, 0x7fffff ; GISEL-NEXT: v_and_or_b32 v4, v4, v1, v0 -; GISEL-NEXT: .LBB1_14: ; %Flow5 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-NEXT: .LBB1_14: ; %itofp-return ; GISEL-NEXT: v_mov_b32_e32 v0, v4 ; GISEL-NEXT: s_setpc_b64 s[30:31] %cvt = uitofp i128 %x to float @@ -520,9 +588,11 @@ define double @sitofp_i128_to_f64(i128 %x) { ; SDAG-NEXT: v_or_b32_e32 v0, v4, v2 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], exec ; SDAG-NEXT: v_mov_b32_e32 v1, 0 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc -; SDAG-NEXT: s_cbranch_execz .LBB2_14 +; SDAG-NEXT: s_and_b64 s[4:5], vcc, -1 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB2_14 ; SDAG-NEXT: ; %bb.1: ; %itofp-if-end ; SDAG-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; SDAG-NEXT: v_xor_b32_e32 v4, v0, v4 @@ -545,12 +615,14 @@ define double @sitofp_i128_to_f64(i128 %x) { ; SDAG-NEXT: v_add_u32_e32 v1, 64, v1 ; SDAG-NEXT: v_cndmask_b32_e32 v9, v1, v0, vcc ; SDAG-NEXT: v_sub_u32_e32 v8, 0x80, v9 -; SDAG-NEXT: v_sub_u32_e32 v2, 0x7f, v9 ; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 54, v8 +; SDAG-NEXT: s_xor_b64 s[4:5], vcc, exec +; SDAG-NEXT: s_and_b64 s[8:9], vcc, -1 +; SDAG-NEXT: v_sub_u32_e32 v2, 0x7f, v9 ; SDAG-NEXT: ; implicit-def: $vgpr10 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB2_3 ; SDAG-NEXT: ; %bb.2: ; %itofp-if-else ; SDAG-NEXT: v_add_u32_e32 v6, 0xffffffb5, v9 ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v6, v[4:5] @@ -561,18 +633,24 @@ define double @sitofp_i128_to_f64(i128 %x) { ; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 ; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; SDAG-NEXT: ; implicit-def: $vgpr9 -; SDAG-NEXT: ; %bb.3: ; %Flow3 -; SDAG-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB2_13 +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: .LBB2_3: ; %Flow3 +; SDAG-NEXT: s_xor_b64 s[8:9], s[4:5], exec +; SDAG-NEXT: s_and_b64 s[10:11], s[4:5], -1 +; SDAG-NEXT: s_cmov_b64 exec, s[4:5] +; SDAG-NEXT: s_cbranch_scc0 .LBB2_13 ; SDAG-NEXT: ; %bb.4: ; %NodeBlock ; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 54, v8 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB2_8 +; SDAG-NEXT: s_xor_b64 s[10:11], vcc, exec +; SDAG-NEXT: s_and_b64 s[4:5], vcc, -1 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB2_8 ; SDAG-NEXT: ; %bb.5: ; %LeafBlock ; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 55, v8 -; SDAG-NEXT: s_and_saveexec_b64 s[12:13], vcc -; SDAG-NEXT: s_cbranch_execz .LBB2_7 +; SDAG-NEXT: s_mov_b64 s[12:13], exec +; SDAG-NEXT: s_and_b64 s[4:5], vcc, -1 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB2_7 ; SDAG-NEXT: ; %bb.6: ; %itofp-sw-default ; SDAG-NEXT: v_sub_u32_e32 v12, 0x49, v9 ; SDAG-NEXT: v_sub_u32_e32 v10, 64, v12 @@ -616,44 +694,51 @@ define double @sitofp_i128_to_f64(i128 %x) { ; SDAG-NEXT: v_mov_b32_e32 v5, v1 ; SDAG-NEXT: v_mov_b32_e32 v4, v0 ; SDAG-NEXT: v_mov_b32_e32 v7, v11 -; SDAG-NEXT: .LBB2_7: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] +; SDAG-NEXT: .LBB2_7: ; %Flow1 +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB2_8: ; %Flow2 -; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: s_xor_b64 s[4:5], s[10:11], exec +; SDAG-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; SDAG-NEXT: s_cmov_b64 exec, s[10:11] +; SDAG-NEXT: s_cbranch_scc0 .LBB2_10 ; SDAG-NEXT: ; %bb.9: ; %itofp-sw-bb ; SDAG-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] ; SDAG-NEXT: v_lshrrev_b32_e32 v0, 31, v5 ; SDAG-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5] ; SDAG-NEXT: v_or_b32_e32 v6, v6, v0 -; SDAG-NEXT: ; %bb.10: ; %itofp-sw-epilog ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: .LBB2_10: ; %itofp-sw-epilog ; SDAG-NEXT: v_lshrrev_b32_e32 v0, 2, v4 ; SDAG-NEXT: v_and_or_b32 v0, v0, 1, v4 ; SDAG-NEXT: v_add_co_u32_e32 v4, vcc, 1, v0 ; SDAG-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; SDAG-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; SDAG-NEXT: v_lshrrev_b64 v[0:1], 2, v[4:5] +; SDAG-NEXT: v_and_b32_e32 v9, 0x800000, v5 ; SDAG-NEXT: v_lshlrev_b32_e32 v7, 30, v6 +; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; SDAG-NEXT: s_mov_b64 s[4:5], exec +; SDAG-NEXT: s_and_b64 s[10:11], vcc, -1 ; SDAG-NEXT: v_or_b32_e32 v10, v1, v7 -; SDAG-NEXT: v_and_b32_e32 v1, 0x800000, v5 -; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB2_12 ; SDAG-NEXT: ; %bb.11: ; %itofp-if-then20 ; SDAG-NEXT: v_lshrrev_b64 v[0:1], 3, v[4:5] ; SDAG-NEXT: v_lshlrev_b32_e32 v2, 29, v6 ; SDAG-NEXT: v_or_b32_e32 v10, v1, v2 ; SDAG-NEXT: v_mov_b32_e32 v2, v8 -; SDAG-NEXT: ; %bb.12: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] -; SDAG-NEXT: .LBB2_13: ; %Flow4 +; SDAG-NEXT: .LBB2_12: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: .LBB2_13: ; %itofp-if-end26 ; SDAG-NEXT: v_and_b32_e32 v1, 0x80000000, v3 ; SDAG-NEXT: v_mov_b32_e32 v3, 0x3ff00000 ; SDAG-NEXT: v_lshl_add_u32 v2, v2, 20, v3 ; SDAG-NEXT: v_and_b32_e32 v3, 0xfffff, v10 ; SDAG-NEXT: v_or3_b32 v1, v3, v1, v2 -; SDAG-NEXT: .LBB2_14: ; %Flow5 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: .LBB2_14: ; %itofp-return ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: sitofp_i128_to_f64: @@ -661,14 +746,16 @@ define double @sitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v4, v0 ; GISEL-NEXT: v_mov_b32_e32 v5, v1 -; GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GISEL-NEXT: v_or_b32_e32 v0, v4, v2 ; GISEL-NEXT: v_or_b32_e32 v1, v5, v3 ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: s_mov_b64 s[6:7], exec +; GISEL-NEXT: s_and_b64 s[8:9], vcc, -1 ; GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GISEL-NEXT: s_cbranch_execz .LBB2_14 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB2_14 ; GISEL-NEXT: ; %bb.1: ; %itofp-if-end ; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v3 ; GISEL-NEXT: v_xor_b32_e32 v0, v6, v4 @@ -691,12 +778,14 @@ define double @sitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: v_min_u32_e32 v5, v5, v7 ; GISEL-NEXT: v_cndmask_b32_e32 v9, v5, v4, vcc ; GISEL-NEXT: v_sub_u32_e32 v8, 0x80, v9 -; GISEL-NEXT: v_sub_u32_e32 v7, 0x7f, v9 ; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 53, v8 +; GISEL-NEXT: s_xor_b64 s[4:5], vcc, exec +; GISEL-NEXT: s_and_b64 s[8:9], vcc, -1 +; GISEL-NEXT: v_sub_u32_e32 v7, 0x7f, v9 ; GISEL-NEXT: ; implicit-def: $vgpr10 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB2_3 ; GISEL-NEXT: ; %bb.2: ; %itofp-if-else ; GISEL-NEXT: v_add_u32_e32 v2, 0xffffffb5, v9 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] @@ -706,18 +795,24 @@ define double @sitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: ; implicit-def: $vgpr8 ; GISEL-NEXT: ; implicit-def: $vgpr0 ; GISEL-NEXT: ; implicit-def: $vgpr9 -; GISEL-NEXT: ; %bb.3: ; %Flow3 -; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GISEL-NEXT: s_cbranch_execz .LBB2_13 +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB2_3: ; %Flow3 +; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], exec +; GISEL-NEXT: s_and_b64 s[10:11], s[4:5], -1 +; GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GISEL-NEXT: s_cbranch_scc0 .LBB2_13 ; GISEL-NEXT: ; %bb.4: ; %NodeBlock ; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 55, v8 -; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GISEL-NEXT: s_xor_b64 s[10:11], exec, s[4:5] -; GISEL-NEXT: s_cbranch_execz .LBB2_8 +; GISEL-NEXT: s_xor_b64 s[10:11], vcc, exec +; GISEL-NEXT: s_and_b64 s[4:5], vcc, -1 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB2_8 ; GISEL-NEXT: ; %bb.5: ; %LeafBlock ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 55, v8 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc -; GISEL-NEXT: s_cbranch_execz .LBB2_7 +; GISEL-NEXT: s_mov_b64 s[12:13], exec +; GISEL-NEXT: s_and_b64 s[4:5], vcc, -1 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB2_7 ; GISEL-NEXT: ; %bb.6: ; %itofp-sw-default ; GISEL-NEXT: v_sub_u32_e32 v14, 0x49, v9 ; GISEL-NEXT: v_sub_u32_e32 v10, 64, v14 @@ -762,10 +857,14 @@ define double @sitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: v_mov_b32_e32 v1, v4 ; GISEL-NEXT: v_mov_b32_e32 v2, v5 ; GISEL-NEXT: v_mov_b32_e32 v3, v6 -; GISEL-NEXT: .LBB2_7: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: .LBB2_7: ; %Flow1 +; GISEL-NEXT: s_or_b64 exec, exec, s[10:11] ; GISEL-NEXT: .LBB2_8: ; %Flow2 -; GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; GISEL-NEXT: s_xor_b64 s[4:5], s[10:11], exec +; GISEL-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; GISEL-NEXT: s_cmov_b64 exec, s[10:11] +; GISEL-NEXT: s_cbranch_scc0 .LBB2_10 ; GISEL-NEXT: ; %bb.9: ; %itofp-sw-bb ; GISEL-NEXT: v_lshlrev_b64 v[9:10], 1, v[0:1] ; GISEL-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] @@ -775,27 +874,30 @@ define double @sitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: v_mov_b32_e32 v1, v10 ; GISEL-NEXT: v_mov_b32_e32 v2, v11 ; GISEL-NEXT: v_mov_b32_e32 v3, v12 -; GISEL-NEXT: ; %bb.10: ; %itofp-sw-epilog ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB2_10: ; %itofp-sw-epilog ; GISEL-NEXT: v_bfe_u32 v3, v0, 2, 1 ; GISEL-NEXT: v_or_b32_e32 v0, v0, v3 ; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 ; GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GISEL-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc +; GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GISEL-NEXT: v_and_b32_e32 v4, 0x800000, v1 +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[3:4] ; GISEL-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1] -; GISEL-NEXT: v_mov_b32_e32 v9, 0 -; GISEL-NEXT: v_and_b32_e32 v10, 0x800000, v1 -; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[9:10] +; GISEL-NEXT: s_mov_b64 s[4:5], exec +; GISEL-NEXT: s_and_b64 s[10:11], vcc, -1 ; GISEL-NEXT: v_lshl_or_b32 v10, v2, 30, v5 -; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB2_12 ; GISEL-NEXT: ; %bb.11: ; %itofp-if-then20 ; GISEL-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1] ; GISEL-NEXT: v_mov_b32_e32 v7, v8 ; GISEL-NEXT: v_lshl_or_b32 v10, v2, 29, v5 -; GISEL-NEXT: ; %bb.12: ; %Flow ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GISEL-NEXT: .LBB2_13: ; %Flow4 +; GISEL-NEXT: .LBB2_12: ; %Flow ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: .LBB2_13: ; %itofp-if-end26 ; GISEL-NEXT: v_and_b32_e32 v0, 0x80000000, v6 ; GISEL-NEXT: v_mov_b32_e32 v1, 0x3ff00000 ; GISEL-NEXT: v_mov_b32_e32 v2, 0xfffff @@ -803,8 +905,8 @@ define double @sitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: v_and_or_b32 v2, v10, v2, v0 ; GISEL-NEXT: v_and_or_b32 v0, v4, -1, 0 ; GISEL-NEXT: v_or3_b32 v1, v2, v1, 0 -; GISEL-NEXT: .LBB2_14: ; %Flow5 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-NEXT: .LBB2_14: ; %itofp-return ; GISEL-NEXT: s_setpc_b64 s[30:31] %cvt = sitofp i128 %x to double ret double %cvt @@ -818,9 +920,11 @@ define double @uitofp_i128_to_f64(i128 %x) { ; SDAG-NEXT: v_or_b32_e32 v4, v0, v2 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; SDAG-NEXT: v_mov_b32_e32 v4, 0 +; SDAG-NEXT: s_mov_b64 s[6:7], exec ; SDAG-NEXT: v_mov_b32_e32 v5, 0 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc -; SDAG-NEXT: s_cbranch_execz .LBB3_14 +; SDAG-NEXT: s_and_b64 s[4:5], vcc, -1 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB3_14 ; SDAG-NEXT: ; %bb.1: ; %itofp-if-end ; SDAG-NEXT: v_ffbh_u32_e32 v4, v2 ; SDAG-NEXT: v_add_u32_e32 v4, 32, v4 @@ -834,12 +938,14 @@ define double @uitofp_i128_to_f64(i128 %x) { ; SDAG-NEXT: v_add_u32_e32 v5, 64, v5 ; SDAG-NEXT: v_cndmask_b32_e32 v8, v5, v4, vcc ; SDAG-NEXT: v_sub_u32_e32 v7, 0x80, v8 -; SDAG-NEXT: v_sub_u32_e32 v6, 0x7f, v8 ; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 54, v7 +; SDAG-NEXT: s_xor_b64 s[4:5], vcc, exec +; SDAG-NEXT: s_and_b64 s[8:9], vcc, -1 +; SDAG-NEXT: v_sub_u32_e32 v6, 0x7f, v8 ; SDAG-NEXT: ; implicit-def: $vgpr9 ; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB3_3 ; SDAG-NEXT: ; %bb.2: ; %itofp-if-else ; SDAG-NEXT: v_add_u32_e32 v2, 0xffffffb5, v8 ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] @@ -850,18 +956,24 @@ define double @uitofp_i128_to_f64(i128 %x) { ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr8 -; SDAG-NEXT: ; %bb.3: ; %Flow3 -; SDAG-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB3_13 +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: .LBB3_3: ; %Flow3 +; SDAG-NEXT: s_xor_b64 s[8:9], s[4:5], exec +; SDAG-NEXT: s_and_b64 s[10:11], s[4:5], -1 +; SDAG-NEXT: s_cmov_b64 exec, s[4:5] +; SDAG-NEXT: s_cbranch_scc0 .LBB3_13 ; SDAG-NEXT: ; %bb.4: ; %NodeBlock ; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 54, v7 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB3_8 +; SDAG-NEXT: s_xor_b64 s[10:11], vcc, exec +; SDAG-NEXT: s_and_b64 s[4:5], vcc, -1 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB3_8 ; SDAG-NEXT: ; %bb.5: ; %LeafBlock ; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 55, v7 -; SDAG-NEXT: s_and_saveexec_b64 s[12:13], vcc -; SDAG-NEXT: s_cbranch_execz .LBB3_7 +; SDAG-NEXT: s_mov_b64 s[12:13], exec +; SDAG-NEXT: s_and_b64 s[4:5], vcc, -1 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB3_7 ; SDAG-NEXT: ; %bb.6: ; %itofp-sw-default ; SDAG-NEXT: v_sub_u32_e32 v11, 0x49, v8 ; SDAG-NEXT: v_sub_u32_e32 v9, 64, v11 @@ -905,40 +1017,47 @@ define double @uitofp_i128_to_f64(i128 %x) { ; SDAG-NEXT: v_mov_b32_e32 v0, v4 ; SDAG-NEXT: v_mov_b32_e32 v1, v5 ; SDAG-NEXT: v_mov_b32_e32 v3, v10 -; SDAG-NEXT: .LBB3_7: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] +; SDAG-NEXT: .LBB3_7: ; %Flow1 +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB3_8: ; %Flow2 -; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: s_xor_b64 s[4:5], s[10:11], exec +; SDAG-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; SDAG-NEXT: s_cmov_b64 exec, s[10:11] +; SDAG-NEXT: s_cbranch_scc0 .LBB3_10 ; SDAG-NEXT: ; %bb.9: ; %itofp-sw-bb ; SDAG-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; SDAG-NEXT: v_lshrrev_b32_e32 v3, 31, v1 ; SDAG-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; SDAG-NEXT: v_or_b32_e32 v2, v2, v3 -; SDAG-NEXT: ; %bb.10: ; %itofp-sw-epilog ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: .LBB3_10: ; %itofp-sw-epilog ; SDAG-NEXT: v_lshrrev_b32_e32 v3, 2, v0 ; SDAG-NEXT: v_and_or_b32 v0, v3, 1, v0 ; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 ; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc -; SDAG-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1] ; SDAG-NEXT: v_and_b32_e32 v3, 0x800000, v1 +; SDAG-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1] ; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SDAG-NEXT: s_mov_b64 s[4:5], exec +; SDAG-NEXT: s_and_b64 s[10:11], vcc, -1 ; SDAG-NEXT: v_alignbit_b32 v9, v2, v1, 2 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB3_12 ; SDAG-NEXT: ; %bb.11: ; %itofp-if-then20 ; SDAG-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1] ; SDAG-NEXT: v_alignbit_b32 v9, v2, v1, 3 ; SDAG-NEXT: v_mov_b32_e32 v6, v7 -; SDAG-NEXT: ; %bb.12: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] -; SDAG-NEXT: .LBB3_13: ; %Flow4 +; SDAG-NEXT: .LBB3_12: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: .LBB3_13: ; %itofp-if-end26 ; SDAG-NEXT: v_and_b32_e32 v0, 0xfffff, v9 ; SDAG-NEXT: v_lshl_or_b32 v0, v6, 20, v0 ; SDAG-NEXT: v_add_u32_e32 v5, 0x3ff00000, v0 -; SDAG-NEXT: .LBB3_14: ; %Flow5 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: .LBB3_14: ; %itofp-return ; SDAG-NEXT: v_mov_b32_e32 v0, v4 ; SDAG-NEXT: v_mov_b32_e32 v1, v5 ; SDAG-NEXT: s_setpc_b64 s[30:31] @@ -946,14 +1065,16 @@ define double @uitofp_i128_to_f64(i128 %x) { ; GISEL-LABEL: uitofp_i128_to_f64: ; GISEL: ; %bb.0: ; %itofp-entry ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GISEL-NEXT: v_or_b32_e32 v4, v0, v2 ; GISEL-NEXT: v_or_b32_e32 v5, v1, v3 ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GISEL-NEXT: v_mov_b32_e32 v4, s4 +; GISEL-NEXT: s_mov_b64 s[6:7], exec +; GISEL-NEXT: s_and_b64 s[8:9], vcc, -1 ; GISEL-NEXT: v_mov_b32_e32 v5, s5 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GISEL-NEXT: s_cbranch_execz .LBB3_14 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB3_14 ; GISEL-NEXT: ; %bb.1: ; %itofp-if-end ; GISEL-NEXT: v_ffbh_u32_e32 v5, v0 ; GISEL-NEXT: v_ffbh_u32_e32 v4, v1 @@ -967,12 +1088,14 @@ define double @uitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: v_min_u32_e32 v5, v5, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v8, v5, v4, vcc ; GISEL-NEXT: v_sub_u32_e32 v7, 0x80, v8 -; GISEL-NEXT: v_sub_u32_e32 v6, 0x7f, v8 ; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 53, v7 +; GISEL-NEXT: s_xor_b64 s[4:5], vcc, exec +; GISEL-NEXT: s_and_b64 s[8:9], vcc, -1 +; GISEL-NEXT: v_sub_u32_e32 v6, 0x7f, v8 ; GISEL-NEXT: ; implicit-def: $vgpr9 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB3_3 ; GISEL-NEXT: ; %bb.2: ; %itofp-if-else ; GISEL-NEXT: v_add_u32_e32 v2, 0xffffffb5, v8 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] @@ -982,18 +1105,24 @@ define double @uitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: ; implicit-def: $vgpr7 ; GISEL-NEXT: ; implicit-def: $vgpr0 ; GISEL-NEXT: ; implicit-def: $vgpr8 -; GISEL-NEXT: ; %bb.3: ; %Flow3 -; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GISEL-NEXT: s_cbranch_execz .LBB3_13 +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB3_3: ; %Flow3 +; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], exec +; GISEL-NEXT: s_and_b64 s[10:11], s[4:5], -1 +; GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GISEL-NEXT: s_cbranch_scc0 .LBB3_13 ; GISEL-NEXT: ; %bb.4: ; %NodeBlock ; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 55, v7 -; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GISEL-NEXT: s_xor_b64 s[10:11], exec, s[4:5] -; GISEL-NEXT: s_cbranch_execz .LBB3_8 +; GISEL-NEXT: s_xor_b64 s[10:11], vcc, exec +; GISEL-NEXT: s_and_b64 s[4:5], vcc, -1 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB3_8 ; GISEL-NEXT: ; %bb.5: ; %LeafBlock ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 55, v7 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc -; GISEL-NEXT: s_cbranch_execz .LBB3_7 +; GISEL-NEXT: s_mov_b64 s[12:13], exec +; GISEL-NEXT: s_and_b64 s[4:5], vcc, -1 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB3_7 ; GISEL-NEXT: ; %bb.6: ; %itofp-sw-default ; GISEL-NEXT: v_sub_u32_e32 v13, 0x49, v8 ; GISEL-NEXT: v_sub_u32_e32 v9, 64, v13 @@ -1039,10 +1168,14 @@ define double @uitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: v_mov_b32_e32 v1, v9 ; GISEL-NEXT: v_mov_b32_e32 v2, v10 ; GISEL-NEXT: v_mov_b32_e32 v3, v11 -; GISEL-NEXT: .LBB3_7: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: .LBB3_7: ; %Flow1 +; GISEL-NEXT: s_or_b64 exec, exec, s[10:11] ; GISEL-NEXT: .LBB3_8: ; %Flow2 -; GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; GISEL-NEXT: s_xor_b64 s[4:5], s[10:11], exec +; GISEL-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; GISEL-NEXT: s_cmov_b64 exec, s[10:11] +; GISEL-NEXT: s_cbranch_scc0 .LBB3_10 ; GISEL-NEXT: ; %bb.9: ; %itofp-sw-bb ; GISEL-NEXT: v_lshlrev_b64 v[8:9], 1, v[0:1] ; GISEL-NEXT: v_lshlrev_b64 v[10:11], 1, v[2:3] @@ -1052,8 +1185,8 @@ define double @uitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: v_mov_b32_e32 v1, v9 ; GISEL-NEXT: v_mov_b32_e32 v2, v10 ; GISEL-NEXT: v_mov_b32_e32 v3, v11 -; GISEL-NEXT: ; %bb.10: ; %itofp-sw-epilog ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB3_10: ; %itofp-sw-epilog ; GISEL-NEXT: v_bfe_u32 v4, v0, 2, 1 ; GISEL-NEXT: v_or_b32_e32 v0, v0, v4 ; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 @@ -1066,25 +1199,28 @@ define double @uitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] ; GISEL-NEXT: v_lshlrev_b64 v[8:9], 30, v[2:3] ; GISEL-NEXT: v_lshrrev_b32_e32 v5, 2, v1 +; GISEL-NEXT: s_mov_b64 s[4:5], exec +; GISEL-NEXT: s_and_b64 s[10:11], vcc, -1 ; GISEL-NEXT: v_or_b32_e32 v9, v5, v8 -; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB3_12 ; GISEL-NEXT: ; %bb.11: ; %itofp-if-then20 ; GISEL-NEXT: v_lshlrev_b64 v[2:3], 29, v[2:3] ; GISEL-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1] ; GISEL-NEXT: v_lshrrev_b32_e32 v0, 3, v1 ; GISEL-NEXT: v_or_b32_e32 v9, v0, v2 ; GISEL-NEXT: v_mov_b32_e32 v6, v7 -; GISEL-NEXT: ; %bb.12: ; %Flow ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GISEL-NEXT: .LBB3_13: ; %Flow4 +; GISEL-NEXT: .LBB3_12: ; %Flow ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: .LBB3_13: ; %itofp-if-end26 ; GISEL-NEXT: v_mov_b32_e32 v0, 0x3ff00000 ; GISEL-NEXT: v_lshl_add_u32 v0, v6, 20, v0 ; GISEL-NEXT: v_and_b32_e32 v1, 0xfffff, v9 ; GISEL-NEXT: v_and_or_b32 v4, v4, -1, 0 ; GISEL-NEXT: v_or3_b32 v5, v1, v0, 0 -; GISEL-NEXT: .LBB3_14: ; %Flow5 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-NEXT: .LBB3_14: ; %itofp-return ; GISEL-NEXT: v_mov_b32_e32 v0, v4 ; GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1099,9 +1235,11 @@ define half @sitofp_i128_to_f16(i128 %x) { ; SDAG-NEXT: v_or_b32_e32 v5, v1, v3 ; SDAG-NEXT: v_or_b32_e32 v4, v0, v2 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; SDAG-NEXT: s_mov_b64 s[6:7], exec +; SDAG-NEXT: s_and_b64 s[4:5], vcc, -1 ; SDAG-NEXT: v_mov_b32_e32 v4, 0 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc -; SDAG-NEXT: s_cbranch_execz .LBB4_14 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB4_14 ; SDAG-NEXT: ; %bb.1: ; %itofp-if-end ; SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v3 ; SDAG-NEXT: v_xor_b32_e32 v0, v5, v0 @@ -1124,11 +1262,13 @@ define half @sitofp_i128_to_f16(i128 %x) { ; SDAG-NEXT: v_add_u32_e32 v6, 64, v6 ; SDAG-NEXT: v_cndmask_b32_e32 v7, v6, v2, vcc ; SDAG-NEXT: v_sub_u32_e32 v6, 0x80, v7 -; SDAG-NEXT: v_sub_u32_e32 v2, 0x7f, v7 ; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 25, v6 +; SDAG-NEXT: s_xor_b64 s[4:5], vcc, exec +; SDAG-NEXT: s_and_b64 s[8:9], vcc, -1 +; SDAG-NEXT: v_sub_u32_e32 v2, 0x7f, v7 ; SDAG-NEXT: ; implicit-def: $vgpr8 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB4_3 ; SDAG-NEXT: ; %bb.2: ; %itofp-if-else ; SDAG-NEXT: v_add_u32_e32 v4, 0xffffff98, v7 ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] @@ -1138,18 +1278,24 @@ define half @sitofp_i128_to_f16(i128 %x) { ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr7 ; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 -; SDAG-NEXT: ; %bb.3: ; %Flow3 -; SDAG-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB4_13 +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: .LBB4_3: ; %Flow3 +; SDAG-NEXT: s_xor_b64 s[8:9], s[4:5], exec +; SDAG-NEXT: s_and_b64 s[10:11], s[4:5], -1 +; SDAG-NEXT: s_cmov_b64 exec, s[4:5] +; SDAG-NEXT: s_cbranch_scc0 .LBB4_13 ; SDAG-NEXT: ; %bb.4: ; %NodeBlock ; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 25, v6 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB4_8 +; SDAG-NEXT: s_xor_b64 s[10:11], vcc, exec +; SDAG-NEXT: s_and_b64 s[4:5], vcc, -1 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB4_8 ; SDAG-NEXT: ; %bb.5: ; %LeafBlock ; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 26, v6 -; SDAG-NEXT: s_and_saveexec_b64 s[12:13], vcc -; SDAG-NEXT: s_cbranch_execz .LBB4_7 +; SDAG-NEXT: s_mov_b64 s[12:13], exec +; SDAG-NEXT: s_and_b64 s[4:5], vcc, -1 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB4_7 ; SDAG-NEXT: ; %bb.6: ; %itofp-sw-default ; SDAG-NEXT: v_sub_u32_e32 v12, 0x66, v7 ; SDAG-NEXT: v_sub_u32_e32 v10, 64, v12 @@ -1188,36 +1334,43 @@ define half @sitofp_i128_to_f16(i128 %x) { ; SDAG-NEXT: v_or_b32_e32 v8, v15, v0 ; SDAG-NEXT: v_mov_b32_e32 v0, v8 ; SDAG-NEXT: v_mov_b32_e32 v1, v9 -; SDAG-NEXT: .LBB4_7: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] +; SDAG-NEXT: .LBB4_7: ; %Flow1 +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB4_8: ; %Flow2 -; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: s_xor_b64 s[4:5], s[10:11], exec +; SDAG-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; SDAG-NEXT: s_cmov_b64 exec, s[10:11] +; SDAG-NEXT: s_cbranch_scc0 .LBB4_10 ; SDAG-NEXT: ; %bb.9: ; %itofp-sw-bb ; SDAG-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; SDAG-NEXT: ; %bb.10: ; %itofp-sw-epilog ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: .LBB4_10: ; %itofp-sw-epilog ; SDAG-NEXT: v_lshrrev_b32_e32 v4, 2, v0 ; SDAG-NEXT: v_and_or_b32 v0, v4, 1, v0 ; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 ; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; SDAG-NEXT: v_and_b32_e32 v4, 0x4000000, v0 ; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SDAG-NEXT: s_mov_b64 s[4:5], exec +; SDAG-NEXT: s_and_b64 s[10:11], vcc, -1 ; SDAG-NEXT: v_alignbit_b32 v8, v1, v0, 2 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB4_12 ; SDAG-NEXT: ; %bb.11: ; %itofp-if-then20 ; SDAG-NEXT: v_alignbit_b32 v8, v1, v0, 3 ; SDAG-NEXT: v_mov_b32_e32 v2, v6 -; SDAG-NEXT: ; %bb.12: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] -; SDAG-NEXT: .LBB4_13: ; %Flow4 +; SDAG-NEXT: .LBB4_12: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: .LBB4_13: ; %itofp-if-end26 ; SDAG-NEXT: v_and_b32_e32 v0, 0x80000000, v3 ; SDAG-NEXT: v_lshl_add_u32 v1, v2, 23, 1.0 ; SDAG-NEXT: v_and_b32_e32 v2, 0x7fffff, v8 ; SDAG-NEXT: v_or3_b32 v0, v2, v0, v1 ; SDAG-NEXT: v_cvt_f16_f32_e32 v4, v0 -; SDAG-NEXT: .LBB4_14: ; %Flow5 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: .LBB4_14: ; %itofp-return ; SDAG-NEXT: v_mov_b32_e32 v0, v4 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -1226,11 +1379,13 @@ define half @sitofp_i128_to_f16(i128 %x) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_or_b32_e32 v4, v0, v2 ; GISEL-NEXT: v_or_b32_e32 v5, v1, v3 -; GISEL-NEXT: s_mov_b32 s4, 0 ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GISEL-NEXT: v_mov_b32_e32 v4, s4 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GISEL-NEXT: s_cbranch_execz .LBB4_14 +; GISEL-NEXT: s_mov_b32 s8, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], exec +; GISEL-NEXT: s_and_b64 s[4:5], vcc, -1 +; GISEL-NEXT: v_mov_b32_e32 v4, s8 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB4_14 ; GISEL-NEXT: ; %bb.1: ; %itofp-if-end ; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v3 ; GISEL-NEXT: v_xor_b32_e32 v0, v6, v0 @@ -1253,11 +1408,13 @@ define half @sitofp_i128_to_f16(i128 %x) { ; GISEL-NEXT: v_min_u32_e32 v5, v5, v7 ; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc ; GISEL-NEXT: v_sub_u32_e32 v8, 0x80, v5 -; GISEL-NEXT: v_sub_u32_e32 v7, 0x7f, v5 ; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 24, v8 +; GISEL-NEXT: s_xor_b64 s[4:5], vcc, exec +; GISEL-NEXT: s_and_b64 s[8:9], vcc, -1 +; GISEL-NEXT: v_sub_u32_e32 v7, 0x7f, v5 ; GISEL-NEXT: ; implicit-def: $vgpr4 -; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB4_3 ; GISEL-NEXT: ; %bb.2: ; %itofp-if-else ; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff98, v5 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] @@ -1267,18 +1424,24 @@ define half @sitofp_i128_to_f16(i128 %x) { ; GISEL-NEXT: ; implicit-def: $vgpr0 ; GISEL-NEXT: ; implicit-def: $vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr2 -; GISEL-NEXT: ; %bb.3: ; %Flow3 -; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GISEL-NEXT: s_cbranch_execz .LBB4_13 +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB4_3: ; %Flow3 +; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], exec +; GISEL-NEXT: s_and_b64 s[10:11], s[4:5], -1 +; GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GISEL-NEXT: s_cbranch_scc0 .LBB4_13 ; GISEL-NEXT: ; %bb.4: ; %NodeBlock ; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 26, v8 -; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GISEL-NEXT: s_xor_b64 s[10:11], exec, s[4:5] -; GISEL-NEXT: s_cbranch_execz .LBB4_8 +; GISEL-NEXT: s_xor_b64 s[10:11], vcc, exec +; GISEL-NEXT: s_and_b64 s[4:5], vcc, -1 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB4_8 ; GISEL-NEXT: ; %bb.5: ; %LeafBlock ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 26, v8 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc -; GISEL-NEXT: s_cbranch_execz .LBB4_7 +; GISEL-NEXT: s_mov_b64 s[12:13], exec +; GISEL-NEXT: s_and_b64 s[4:5], vcc, -1 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB4_7 ; GISEL-NEXT: ; %bb.6: ; %itofp-sw-default ; GISEL-NEXT: v_sub_u32_e32 v4, 0x66, v5 ; GISEL-NEXT: v_sub_u32_e32 v11, 64, v4 @@ -1321,37 +1484,44 @@ define half @sitofp_i128_to_f16(i128 %x) { ; GISEL-NEXT: v_mov_b32_e32 v1, v4 ; GISEL-NEXT: v_mov_b32_e32 v2, v5 ; GISEL-NEXT: v_mov_b32_e32 v3, v6 -; GISEL-NEXT: .LBB4_7: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: .LBB4_7: ; %Flow1 +; GISEL-NEXT: s_or_b64 exec, exec, s[10:11] ; GISEL-NEXT: .LBB4_8: ; %Flow2 -; GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; GISEL-NEXT: s_xor_b64 s[4:5], s[10:11], exec +; GISEL-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; GISEL-NEXT: s_cmov_b64 exec, s[10:11] +; GISEL-NEXT: s_cbranch_scc0 .LBB4_10 ; GISEL-NEXT: ; %bb.9: ; %itofp-sw-bb ; GISEL-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GISEL-NEXT: ; %bb.10: ; %itofp-sw-epilog ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB4_10: ; %itofp-sw-epilog ; GISEL-NEXT: v_bfe_u32 v2, v0, 2, 1 ; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 ; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 ; GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GISEL-NEXT: v_and_b32_e32 v2, 0x4000000, v0 ; GISEL-NEXT: v_mov_b32_e32 v3, 0 -; GISEL-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1] ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1] +; GISEL-NEXT: s_mov_b64 s[4:5], exec +; GISEL-NEXT: s_and_b64 s[10:11], vcc, -1 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB4_12 ; GISEL-NEXT: ; %bb.11: ; %itofp-if-then20 ; GISEL-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1] ; GISEL-NEXT: v_mov_b32_e32 v7, v8 -; GISEL-NEXT: ; %bb.12: ; %Flow ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GISEL-NEXT: .LBB4_13: ; %Flow4 +; GISEL-NEXT: .LBB4_12: ; %Flow ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: .LBB4_13: ; %itofp-if-end26 ; GISEL-NEXT: v_and_b32_e32 v0, 0x80000000, v6 ; GISEL-NEXT: v_lshl_add_u32 v1, v7, 23, 1.0 ; GISEL-NEXT: v_and_b32_e32 v2, 0x7fffff, v4 ; GISEL-NEXT: v_or3_b32 v0, v2, v0, v1 ; GISEL-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GISEL-NEXT: .LBB4_14: ; %Flow5 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-NEXT: .LBB4_14: ; %itofp-return ; GISEL-NEXT: v_mov_b32_e32 v0, v4 ; GISEL-NEXT: s_setpc_b64 s[30:31] %cvt = sitofp i128 %x to half @@ -1365,9 +1535,11 @@ define half @uitofp_i128_to_f16(i128 %x) { ; SDAG-NEXT: v_or_b32_e32 v5, v1, v3 ; SDAG-NEXT: v_or_b32_e32 v4, v0, v2 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; SDAG-NEXT: s_mov_b64 s[6:7], exec +; SDAG-NEXT: s_and_b64 s[4:5], vcc, -1 ; SDAG-NEXT: v_mov_b32_e32 v4, 0 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc -; SDAG-NEXT: s_cbranch_execz .LBB5_14 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB5_14 ; SDAG-NEXT: ; %bb.1: ; %itofp-if-end ; SDAG-NEXT: v_ffbh_u32_e32 v4, v2 ; SDAG-NEXT: v_add_u32_e32 v4, 32, v4 @@ -1381,11 +1553,13 @@ define half @uitofp_i128_to_f16(i128 %x) { ; SDAG-NEXT: v_add_u32_e32 v5, 64, v5 ; SDAG-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc ; SDAG-NEXT: v_sub_u32_e32 v5, 0x80, v6 -; SDAG-NEXT: v_sub_u32_e32 v4, 0x7f, v6 ; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 25, v5 +; SDAG-NEXT: s_xor_b64 s[4:5], vcc, exec +; SDAG-NEXT: s_and_b64 s[8:9], vcc, -1 +; SDAG-NEXT: v_sub_u32_e32 v4, 0x7f, v6 ; SDAG-NEXT: ; implicit-def: $vgpr7 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB5_3 ; SDAG-NEXT: ; %bb.2: ; %itofp-if-else ; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff98, v6 ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] @@ -1395,18 +1569,24 @@ define half @uitofp_i128_to_f16(i128 %x) { ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr6 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: ; %bb.3: ; %Flow3 -; SDAG-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB5_13 +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: .LBB5_3: ; %Flow3 +; SDAG-NEXT: s_xor_b64 s[8:9], s[4:5], exec +; SDAG-NEXT: s_and_b64 s[10:11], s[4:5], -1 +; SDAG-NEXT: s_cmov_b64 exec, s[4:5] +; SDAG-NEXT: s_cbranch_scc0 .LBB5_13 ; SDAG-NEXT: ; %bb.4: ; %NodeBlock ; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 25, v5 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB5_8 +; SDAG-NEXT: s_xor_b64 s[10:11], vcc, exec +; SDAG-NEXT: s_and_b64 s[4:5], vcc, -1 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB5_8 ; SDAG-NEXT: ; %bb.5: ; %LeafBlock ; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 26, v5 -; SDAG-NEXT: s_and_saveexec_b64 s[12:13], vcc -; SDAG-NEXT: s_cbranch_execz .LBB5_7 +; SDAG-NEXT: s_mov_b64 s[12:13], exec +; SDAG-NEXT: s_and_b64 s[4:5], vcc, -1 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB5_7 ; SDAG-NEXT: ; %bb.6: ; %itofp-sw-default ; SDAG-NEXT: v_sub_u32_e32 v11, 0x66, v6 ; SDAG-NEXT: v_sub_u32_e32 v9, 64, v11 @@ -1445,35 +1625,42 @@ define half @uitofp_i128_to_f16(i128 %x) { ; SDAG-NEXT: v_or_b32_e32 v7, v14, v0 ; SDAG-NEXT: v_mov_b32_e32 v0, v7 ; SDAG-NEXT: v_mov_b32_e32 v1, v8 -; SDAG-NEXT: .LBB5_7: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] +; SDAG-NEXT: .LBB5_7: ; %Flow1 +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB5_8: ; %Flow2 -; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: s_xor_b64 s[4:5], s[10:11], exec +; SDAG-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; SDAG-NEXT: s_cmov_b64 exec, s[10:11] +; SDAG-NEXT: s_cbranch_scc0 .LBB5_10 ; SDAG-NEXT: ; %bb.9: ; %itofp-sw-bb ; SDAG-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; SDAG-NEXT: ; %bb.10: ; %itofp-sw-epilog ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: .LBB5_10: ; %itofp-sw-epilog ; SDAG-NEXT: v_lshrrev_b32_e32 v2, 2, v0 ; SDAG-NEXT: v_and_or_b32 v0, v2, 1, v0 ; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 ; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; SDAG-NEXT: v_and_b32_e32 v2, 0x4000000, v0 ; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SDAG-NEXT: s_mov_b64 s[4:5], exec +; SDAG-NEXT: s_and_b64 s[10:11], vcc, -1 ; SDAG-NEXT: v_alignbit_b32 v7, v1, v0, 2 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB5_12 ; SDAG-NEXT: ; %bb.11: ; %itofp-if-then20 ; SDAG-NEXT: v_alignbit_b32 v7, v1, v0, 3 ; SDAG-NEXT: v_mov_b32_e32 v4, v5 -; SDAG-NEXT: ; %bb.12: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] -; SDAG-NEXT: .LBB5_13: ; %Flow4 +; SDAG-NEXT: .LBB5_12: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: .LBB5_13: ; %itofp-if-end26 ; SDAG-NEXT: v_and_b32_e32 v0, 0x7fffff, v7 ; SDAG-NEXT: v_lshl_or_b32 v0, v4, 23, v0 ; SDAG-NEXT: v_add_u32_e32 v0, 1.0, v0 ; SDAG-NEXT: v_cvt_f16_f32_e32 v4, v0 -; SDAG-NEXT: .LBB5_14: ; %Flow5 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: .LBB5_14: ; %itofp-return ; SDAG-NEXT: v_mov_b32_e32 v0, v4 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -1482,11 +1669,13 @@ define half @uitofp_i128_to_f16(i128 %x) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_or_b32_e32 v4, v0, v2 ; GISEL-NEXT: v_or_b32_e32 v5, v1, v3 -; GISEL-NEXT: s_mov_b32 s4, 0 ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GISEL-NEXT: v_mov_b32_e32 v4, s4 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GISEL-NEXT: s_cbranch_execz .LBB5_14 +; GISEL-NEXT: s_mov_b32 s8, 0 +; GISEL-NEXT: s_mov_b64 s[6:7], exec +; GISEL-NEXT: s_and_b64 s[4:5], vcc, -1 +; GISEL-NEXT: v_mov_b32_e32 v4, s8 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB5_14 ; GISEL-NEXT: ; %bb.1: ; %itofp-if-end ; GISEL-NEXT: v_ffbh_u32_e32 v5, v0 ; GISEL-NEXT: v_ffbh_u32_e32 v4, v1 @@ -1500,11 +1689,13 @@ define half @uitofp_i128_to_f16(i128 %x) { ; GISEL-NEXT: v_min_u32_e32 v5, v5, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc ; GISEL-NEXT: v_sub_u32_e32 v7, 0x80, v5 -; GISEL-NEXT: v_sub_u32_e32 v6, 0x7f, v5 ; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 24, v7 +; GISEL-NEXT: s_xor_b64 s[4:5], vcc, exec +; GISEL-NEXT: s_and_b64 s[8:9], vcc, -1 +; GISEL-NEXT: v_sub_u32_e32 v6, 0x7f, v5 ; GISEL-NEXT: ; implicit-def: $vgpr4 -; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB5_3 ; GISEL-NEXT: ; %bb.2: ; %itofp-if-else ; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff98, v5 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] @@ -1514,18 +1705,24 @@ define half @uitofp_i128_to_f16(i128 %x) { ; GISEL-NEXT: ; implicit-def: $vgpr0 ; GISEL-NEXT: ; implicit-def: $vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr2 -; GISEL-NEXT: ; %bb.3: ; %Flow3 -; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GISEL-NEXT: s_cbranch_execz .LBB5_13 +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB5_3: ; %Flow3 +; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], exec +; GISEL-NEXT: s_and_b64 s[10:11], s[4:5], -1 +; GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GISEL-NEXT: s_cbranch_scc0 .LBB5_13 ; GISEL-NEXT: ; %bb.4: ; %NodeBlock ; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 26, v7 -; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GISEL-NEXT: s_xor_b64 s[10:11], exec, s[4:5] -; GISEL-NEXT: s_cbranch_execz .LBB5_8 +; GISEL-NEXT: s_xor_b64 s[10:11], vcc, exec +; GISEL-NEXT: s_and_b64 s[4:5], vcc, -1 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB5_8 ; GISEL-NEXT: ; %bb.5: ; %LeafBlock ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 26, v7 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc -; GISEL-NEXT: s_cbranch_execz .LBB5_7 +; GISEL-NEXT: s_mov_b64 s[12:13], exec +; GISEL-NEXT: s_and_b64 s[4:5], vcc, -1 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB5_7 ; GISEL-NEXT: ; %bb.6: ; %itofp-sw-default ; GISEL-NEXT: v_sub_u32_e32 v4, 0x66, v5 ; GISEL-NEXT: v_sub_u32_e32 v10, 64, v4 @@ -1568,36 +1765,43 @@ define half @uitofp_i128_to_f16(i128 %x) { ; GISEL-NEXT: v_mov_b32_e32 v1, v4 ; GISEL-NEXT: v_mov_b32_e32 v2, v5 ; GISEL-NEXT: v_mov_b32_e32 v3, v6 -; GISEL-NEXT: .LBB5_7: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: .LBB5_7: ; %Flow1 +; GISEL-NEXT: s_or_b64 exec, exec, s[10:11] ; GISEL-NEXT: .LBB5_8: ; %Flow2 -; GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; GISEL-NEXT: s_xor_b64 s[4:5], s[10:11], exec +; GISEL-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; GISEL-NEXT: s_cmov_b64 exec, s[10:11] +; GISEL-NEXT: s_cbranch_scc0 .LBB5_10 ; GISEL-NEXT: ; %bb.9: ; %itofp-sw-bb ; GISEL-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GISEL-NEXT: ; %bb.10: ; %itofp-sw-epilog ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB5_10: ; %itofp-sw-epilog ; GISEL-NEXT: v_bfe_u32 v2, v0, 2, 1 ; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 ; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 ; GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GISEL-NEXT: v_and_b32_e32 v2, 0x4000000, v0 ; GISEL-NEXT: v_mov_b32_e32 v3, 0 -; GISEL-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1] ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1] +; GISEL-NEXT: s_mov_b64 s[4:5], exec +; GISEL-NEXT: s_and_b64 s[10:11], vcc, -1 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB5_12 ; GISEL-NEXT: ; %bb.11: ; %itofp-if-then20 ; GISEL-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1] ; GISEL-NEXT: v_mov_b32_e32 v6, v7 -; GISEL-NEXT: ; %bb.12: ; %Flow ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GISEL-NEXT: .LBB5_13: ; %Flow4 +; GISEL-NEXT: .LBB5_12: ; %Flow ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: .LBB5_13: ; %itofp-if-end26 ; GISEL-NEXT: v_lshl_add_u32 v0, v6, 23, 1.0 ; GISEL-NEXT: v_mov_b32_e32 v1, 0x7fffff ; GISEL-NEXT: v_and_or_b32 v0, v4, v1, v0 ; GISEL-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GISEL-NEXT: .LBB5_14: ; %Flow5 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-NEXT: .LBB5_14: ; %itofp-return ; GISEL-NEXT: v_mov_b32_e32 v0, v4 ; GISEL-NEXT: s_setpc_b64 s[30:31] %cvt = uitofp i128 %x to half diff --git a/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll index 3e0ad65c49821..c0b3dc53e5b6b 100644 --- a/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll @@ -13,31 +13,36 @@ define amdgpu_ps void @return_void(float %0) #0 { ; CHECK-LABEL: return_void: ; CHECK: ; %bb.0: ; %main_body -; CHECK-NEXT: s_mov_b64 s[0:1], exec -; CHECK-NEXT: s_mov_b32 s2, 0x41200000 -; CHECK-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v0 -; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc -; CHECK-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; CHECK-NEXT: s_cbranch_execz .LBB0_3 +; CHECK-NEXT: s_mov_b64 s[2:3], exec +; CHECK-NEXT: s_mov_b32 s0, 0x41200000 +; CHECK-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 +; CHECK-NEXT: s_xor_b64 s[0:1], vcc, exec +; CHECK-NEXT: s_and_b64 s[4:5], vcc, -1 +; CHECK-NEXT: s_cmov_b64 exec, vcc +; CHECK-NEXT: s_cbranch_scc0 .LBB0_4 ; CHECK-NEXT: .LBB0_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], exec -; CHECK-NEXT: s_cbranch_scc0 .LBB0_6 +; CHECK-NEXT: s_andn2_b64 s[2:3], s[2:3], exec +; CHECK-NEXT: s_cbranch_scc0 .LBB0_7 ; CHECK-NEXT: ; %bb.2: ; %loop ; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 ; CHECK-NEXT: s_mov_b64 exec, 0 ; CHECK-NEXT: s_mov_b64 vcc, 0 ; CHECK-NEXT: s_branch .LBB0_1 -; CHECK-NEXT: .LBB0_3: ; %Flow1 -; CHECK-NEXT: s_andn2_saveexec_b64 s[0:1], s[2:3] -; CHECK-NEXT: s_cbranch_execz .LBB0_5 -; CHECK-NEXT: ; %bb.4: ; %end +; CHECK-NEXT: ; %bb.3: ; %Flow +; CHECK-NEXT: s_or_b64 exec, exec, s[0:1] +; CHECK-NEXT: .LBB0_4: ; %Flow1 +; CHECK-NEXT: s_xor_b64 s[2:3], s[0:1], exec +; CHECK-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; CHECK-NEXT: s_cmov_b64 exec, s[0:1] +; CHECK-NEXT: s_cbranch_scc0 .LBB0_6 +; CHECK-NEXT: ; %bb.5: ; %end ; CHECK-NEXT: v_mov_b32_e32 v0, 1.0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: exp mrt0 v1, v1, v1, v0 done vm -; CHECK-NEXT: .LBB0_5: ; %UnifiedReturnBlock +; CHECK-NEXT: .LBB0_6: ; %UnifiedReturnBlock ; CHECK-NEXT: s_endpgm -; CHECK-NEXT: .LBB0_6: +; CHECK-NEXT: .LBB0_7: ; CHECK-NEXT: s_mov_b64 exec, 0 ; CHECK-NEXT: exp null off, off, off, off done vm ; CHECK-NEXT: s_endpgm @@ -57,30 +62,35 @@ end: define amdgpu_ps void @return_void_compr(float %0) #0 { ; CHECK-LABEL: return_void_compr: ; CHECK: ; %bb.0: ; %main_body -; CHECK-NEXT: s_mov_b64 s[0:1], exec -; CHECK-NEXT: s_mov_b32 s2, 0x41200000 -; CHECK-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v0 -; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc -; CHECK-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; CHECK-NEXT: s_cbranch_execz .LBB1_3 +; CHECK-NEXT: s_mov_b64 s[2:3], exec +; CHECK-NEXT: s_mov_b32 s0, 0x41200000 +; CHECK-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 +; CHECK-NEXT: s_xor_b64 s[0:1], vcc, exec +; CHECK-NEXT: s_and_b64 s[4:5], vcc, -1 +; CHECK-NEXT: s_cmov_b64 exec, vcc +; CHECK-NEXT: s_cbranch_scc0 .LBB1_4 ; CHECK-NEXT: .LBB1_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], exec -; CHECK-NEXT: s_cbranch_scc0 .LBB1_6 +; CHECK-NEXT: s_andn2_b64 s[2:3], s[2:3], exec +; CHECK-NEXT: s_cbranch_scc0 .LBB1_7 ; CHECK-NEXT: ; %bb.2: ; %loop ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 ; CHECK-NEXT: s_mov_b64 exec, 0 ; CHECK-NEXT: s_mov_b64 vcc, 0 ; CHECK-NEXT: s_branch .LBB1_1 -; CHECK-NEXT: .LBB1_3: ; %Flow1 -; CHECK-NEXT: s_andn2_saveexec_b64 s[0:1], s[2:3] -; CHECK-NEXT: s_cbranch_execz .LBB1_5 -; CHECK-NEXT: ; %bb.4: ; %end +; CHECK-NEXT: ; %bb.3: ; %Flow +; CHECK-NEXT: s_or_b64 exec, exec, s[0:1] +; CHECK-NEXT: .LBB1_4: ; %Flow1 +; CHECK-NEXT: s_xor_b64 s[2:3], s[0:1], exec +; CHECK-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; CHECK-NEXT: s_cmov_b64 exec, s[0:1] +; CHECK-NEXT: s_cbranch_scc0 .LBB1_6 +; CHECK-NEXT: ; %bb.5: ; %end ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: exp mrt0 v0, off, v0, off done compr vm -; CHECK-NEXT: .LBB1_5: ; %UnifiedReturnBlock +; CHECK-NEXT: .LBB1_6: ; %UnifiedReturnBlock ; CHECK-NEXT: s_endpgm -; CHECK-NEXT: .LBB1_6: +; CHECK-NEXT: .LBB1_7: ; CHECK-NEXT: s_mov_b64 exec, 0 ; CHECK-NEXT: exp null off, off, off, off done vm ; CHECK-NEXT: s_endpgm @@ -114,9 +124,9 @@ define amdgpu_ps void @only_kill() #0 { ; CHECK-NEXT: ; %bb.3: ; %DummyReturnBlock ; CHECK-NEXT: s_endpgm ; CHECK-NEXT: .LBB2_4: -; CHECK-NEXT: s_mov_b64 exec, 0 -; CHECK-NEXT: exp null off, off, off, off done vm -; CHECK-NEXT: s_endpgm +; CHECK-NEXT: s_mov_b64 exec, 0 +; CHECK-NEXT: exp null off, off, off, off done vm +; CHECK-NEXT: s_endpgm main_body: br label %loop @@ -132,27 +142,29 @@ define amdgpu_ps float @return_nonvoid(float %0) #0 { ; CHECK-NEXT: s_mov_b64 s[0:1], exec ; CHECK-NEXT: s_mov_b32 s2, 0x41200000 ; CHECK-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v0 -; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc -; CHECK-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; CHECK-NEXT: s_cbranch_execz .LBB3_3 +; CHECK-NEXT: s_xor_b64 s[2:3], vcc, exec +; CHECK-NEXT: s_and_b64 s[4:5], vcc, -1 +; CHECK-NEXT: s_cmov_b64 exec, vcc +; CHECK-NEXT: s_cbranch_scc0 .LBB3_4 ; CHECK-NEXT: .LBB3_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], exec -; CHECK-NEXT: s_cbranch_scc0 .LBB3_4 +; CHECK-NEXT: s_cbranch_scc0 .LBB3_5 ; CHECK-NEXT: ; %bb.2: ; %loop ; CHECK-NEXT: ; in Loop: Header=BB3_1 Depth=1 ; CHECK-NEXT: s_mov_b64 exec, 0 ; CHECK-NEXT: s_mov_b64 vcc, exec ; CHECK-NEXT: s_cbranch_execnz .LBB3_1 -; CHECK-NEXT: .LBB3_3: ; %Flow1 +; CHECK-NEXT: ; %bb.3: ; %Flow ; CHECK-NEXT: s_or_b64 exec, exec, s[2:3] +; CHECK-NEXT: .LBB3_4: ; %UnifiedReturnBlock ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_branch .LBB3_5 -; CHECK-NEXT: .LBB3_4: +; CHECK-NEXT: s_branch .LBB3_6 +; CHECK-NEXT: .LBB3_5: ; CHECK-NEXT: s_mov_b64 exec, 0 ; CHECK-NEXT: exp null off, off, off, off done vm ; CHECK-NEXT: s_endpgm -; CHECK-NEXT: .LBB3_5: +; CHECK-NEXT: .LBB3_6: main_body: %cmp = fcmp olt float %0, 1.000000e+01 br i1 %cmp, label %end, label %loop diff --git a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll index 3b3e107a62967..9e230fe3e42c5 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll @@ -165,10 +165,12 @@ define void @func_uses_lds_multi(i1 %cond) { ; GFX8-SDAG-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX8-SDAG-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GFX8-SDAG-NEXT: s_and_b64 s[6:7], s[4:5], exec +; GFX8-SDAG-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; GFX8-SDAG-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX8-SDAG-NEXT: s_mov_b32 m0, -1 -; GFX8-SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GFX8-SDAG-NEXT: s_xor_b64 s[4:5], exec, s[6:7] -; GFX8-SDAG-NEXT: s_cbranch_execz .LBB2_2 +; GFX8-SDAG-NEXT: s_cmov_b64 exec, s[6:7] +; GFX8-SDAG-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX8-SDAG-NEXT: ; %bb.1: ; %bb1 ; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 1 ; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0xc8 @@ -176,18 +178,21 @@ define void @func_uses_lds_multi(i1 %cond) { ; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-SDAG-NEXT: s_trap 2 +; GFX8-SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-SDAG-NEXT: .LBB2_2: ; %Flow -; GFX8-SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX8-SDAG-NEXT: s_cbranch_execz .LBB2_4 +; GFX8-SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX8-SDAG-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GFX8-SDAG-NEXT: s_cmov_b64 exec, s[4:5] +; GFX8-SDAG-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX8-SDAG-NEXT: ; %bb.3: ; %bb0 ; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0xc8 +; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0xc8 ; GFX8-SDAG-NEXT: ds_write_b32 v0, v0 -; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-SDAG-NEXT: s_trap 2 +; GFX8-SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-SDAG-NEXT: .LBB2_4: ; %ret -; GFX8-SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 2 ; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0xc8 ; GFX8-SDAG-NEXT: ds_write_b32 v0, v0 @@ -202,9 +207,11 @@ define void @func_uses_lds_multi(i1 %cond) { ; GFX8-GISEL-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX8-GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GFX8-GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GFX8-GISEL-NEXT: s_xor_b64 s[4:5], exec, s[6:7] -; GFX8-GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX8-GISEL-NEXT: s_and_b64 s[6:7], s[4:5], exec +; GFX8-GISEL-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; GFX8-GISEL-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX8-GISEL-NEXT: s_cmov_b64 exec, s[6:7] +; GFX8-GISEL-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX8-GISEL-NEXT: ; %bb.1: ; %bb1 ; GFX8-GISEL-NEXT: s_mov_b64 s[6:7], 0xc8 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, 1 @@ -213,19 +220,22 @@ define void @func_uses_lds_multi(i1 %cond) { ; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-GISEL-NEXT: s_trap 2 ; GFX8-GISEL-NEXT: ds_write_b32 v0, v0 +; GFX8-GISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-GISEL-NEXT: .LBB2_2: ; %Flow -; GFX8-GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX8-GISEL-NEXT: s_cbranch_execz .LBB2_4 +; GFX8-GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX8-GISEL-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GFX8-GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GFX8-GISEL-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX8-GISEL-NEXT: ; %bb.3: ; %bb0 -; GFX8-GISEL-NEXT: s_mov_b64 s[6:7], 0xc8 +; GFX8-GISEL-NEXT: s_mov_b64 s[4:5], 0xc8 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-GISEL-NEXT: s_mov_b32 m0, -1 -; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-GISEL-NEXT: s_trap 2 ; GFX8-GISEL-NEXT: ds_write_b32 v0, v0 +; GFX8-GISEL-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-GISEL-NEXT: .LBB2_4: ; %ret -; GFX8-GISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-GISEL-NEXT: s_mov_b64 s[4:5], 0xc8 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, 2 ; GFX8-GISEL-NEXT: s_mov_b32 m0, -1 @@ -242,22 +252,27 @@ define void @func_uses_lds_multi(i1 %cond) { ; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-SDAG-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GFX9-SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GFX9-SDAG-NEXT: s_xor_b64 s[4:5], exec, s[6:7] -; GFX9-SDAG-NEXT: s_cbranch_execz .LBB2_2 +; GFX9-SDAG-NEXT: s_and_b64 s[6:7], s[4:5], exec +; GFX9-SDAG-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; GFX9-SDAG-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX9-SDAG-NEXT: s_cmov_b64 exec, s[6:7] +; GFX9-SDAG-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX9-SDAG-NEXT: ; %bb.1: ; %bb1 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-SDAG-NEXT: ds_write_b32 v0, v0 ; GFX9-SDAG-NEXT: s_trap 2 +; GFX9-SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-SDAG-NEXT: .LBB2_2: ; %Flow -; GFX9-SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-SDAG-NEXT: s_cbranch_execz .LBB2_4 +; GFX9-SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX9-SDAG-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GFX9-SDAG-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9-SDAG-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX9-SDAG-NEXT: ; %bb.3: ; %bb0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-SDAG-NEXT: ds_write_b32 v0, v0 ; GFX9-SDAG-NEXT: s_trap 2 +; GFX9-SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-SDAG-NEXT: .LBB2_4: ; %ret -; GFX9-SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 2 ; GFX9-SDAG-NEXT: ds_write_b32 v0, v0 ; GFX9-SDAG-NEXT: s_trap 2 @@ -270,22 +285,27 @@ define void @func_uses_lds_multi(i1 %cond) { ; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GFX9-GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GFX9-GISEL-NEXT: s_xor_b64 s[4:5], exec, s[6:7] -; GFX9-GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX9-GISEL-NEXT: s_and_b64 s[6:7], s[4:5], exec +; GFX9-GISEL-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; GFX9-GISEL-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX9-GISEL-NEXT: s_cmov_b64 exec, s[6:7] +; GFX9-GISEL-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX9-GISEL-NEXT: ; %bb.1: ; %bb1 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-GISEL-NEXT: s_trap 2 ; GFX9-GISEL-NEXT: ds_write_b32 v0, v0 +; GFX9-GISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-GISEL-NEXT: .LBB2_2: ; %Flow -; GFX9-GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-GISEL-NEXT: s_cbranch_execz .LBB2_4 +; GFX9-GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX9-GISEL-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GFX9-GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9-GISEL-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX9-GISEL-NEXT: ; %bb.3: ; %bb0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_trap 2 ; GFX9-GISEL-NEXT: ds_write_b32 v0, v0 +; GFX9-GISEL-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-GISEL-NEXT: .LBB2_4: ; %ret -; GFX9-GISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 2 ; GFX9-GISEL-NEXT: s_trap 2 ; GFX9-GISEL-NEXT: ds_write_b32 v0, v0 @@ -298,29 +318,36 @@ define void @func_uses_lds_multi(i1 %cond) { ; SDAG-NEXT: v_and_b32_e32 v0, 1, v0 ; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; SDAG-NEXT: s_xor_b64 s[4:5], vcc, -1 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[6:7] -; SDAG-NEXT: s_cbranch_execz .LBB2_2 +; SDAG-NEXT: s_and_b64 s[6:7], s[4:5], exec +; SDAG-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; SDAG-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; SDAG-NEXT: s_cmov_b64 exec, s[6:7] +; SDAG-NEXT: s_cbranch_scc0 .LBB2_3 ; SDAG-NEXT: ; %bb.1: ; %bb1 ; SDAG-NEXT: v_mov_b32_e32 v0, 1 ; SDAG-NEXT: ds_write_b32 v0, v0 -; SDAG-NEXT: s_cbranch_execnz .LBB2_6 -; SDAG-NEXT: .LBB2_2: ; %Flow -; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SDAG-NEXT: s_cbranch_execz .LBB2_4 -; SDAG-NEXT: ; %bb.3: ; %bb0 +; SDAG-NEXT: s_cbranch_execnz .LBB2_8 +; SDAG-NEXT: ; %bb.2: ; %bb1 +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: .LBB2_3: ; %Flow +; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; SDAG-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; SDAG-NEXT: s_cmov_b64 exec, s[4:5] +; SDAG-NEXT: s_cbranch_scc0 .LBB2_6 +; SDAG-NEXT: ; %bb.4: ; %bb0 ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: ds_write_b32 v0, v0 -; SDAG-NEXT: s_cbranch_execnz .LBB2_6 -; SDAG-NEXT: .LBB2_4: ; %ret -; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: s_cbranch_execnz .LBB2_8 +; SDAG-NEXT: ; %bb.5: ; %bb0 +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: .LBB2_6: ; %ret ; SDAG-NEXT: v_mov_b32_e32 v0, 2 ; SDAG-NEXT: ds_write_b32 v0, v0 -; SDAG-NEXT: s_cbranch_execnz .LBB2_6 -; SDAG-NEXT: ; %bb.5: ; %ret +; SDAG-NEXT: s_cbranch_execnz .LBB2_8 +; SDAG-NEXT: ; %bb.7: ; %ret ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: s_setpc_b64 s[30:31] -; SDAG-NEXT: .LBB2_6: +; SDAG-NEXT: .LBB2_8: ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: func_uses_lds_multi: @@ -329,24 +356,29 @@ define void @func_uses_lds_multi(i1 %cond) { ; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[6:7] -; GISEL-NEXT: s_cbranch_execz .LBB2_3 +; GISEL-NEXT: s_and_b64 s[6:7], s[4:5], exec +; GISEL-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; GISEL-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GISEL-NEXT: s_cmov_b64 exec, s[6:7] +; GISEL-NEXT: s_cbranch_scc0 .LBB2_3 ; GISEL-NEXT: ; %bb.1: ; %bb1 ; GISEL-NEXT: s_cbranch_execnz .LBB2_8 ; GISEL-NEXT: ; %bb.2: ; %bb1 ; GISEL-NEXT: v_mov_b32_e32 v0, 1 ; GISEL-NEXT: ds_write_b32 v0, v0 +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GISEL-NEXT: .LBB2_3: ; %Flow -; GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GISEL-NEXT: s_cbranch_execz .LBB2_6 +; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GISEL-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GISEL-NEXT: s_cbranch_scc0 .LBB2_6 ; GISEL-NEXT: ; %bb.4: ; %bb0 ; GISEL-NEXT: s_cbranch_execnz .LBB2_8 ; GISEL-NEXT: ; %bb.5: ; %bb0 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: ds_write_b32 v0, v0 +; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] ; GISEL-NEXT: .LBB2_6: ; %ret -; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GISEL-NEXT: s_cbranch_execnz .LBB2_8 ; GISEL-NEXT: ; %bb.7: ; %ret ; GISEL-NEXT: v_mov_b32_e32 v0, 2 @@ -467,8 +499,10 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) { ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX8-SDAG-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX8-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX8-SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-SDAG-NEXT: s_cbranch_execz .LBB4_2 +; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], exec +; GFX8-SDAG-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX8-SDAG-NEXT: s_cmov_b64 exec, vcc +; GFX8-SDAG-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX8-SDAG-NEXT: ; %bb.1: ; %use.bb ; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-SDAG-NEXT: s_mov_b32 m0, -1 @@ -479,8 +513,8 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) { ; GFX8-SDAG-NEXT: s_trap 2 ; GFX8-SDAG-NEXT: flat_load_dword v0, v[1:2] glc ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX8-SDAG-NEXT: .LBB4_2: ; %ret ; GFX8-SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-SDAG-NEXT: .LBB4_2: ; %ret ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: func_uses_lds_phi_after: @@ -491,8 +525,10 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) { ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX8-GISEL-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX8-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX8-GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX8-GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8-GISEL-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX8-GISEL-NEXT: s_cmov_b64 exec, vcc +; GFX8-GISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX8-GISEL-NEXT: ; %bb.1: ; %use.bb ; GFX8-GISEL-NEXT: s_mov_b64 s[6:7], 0xc8 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -503,8 +539,8 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) { ; GFX8-GISEL-NEXT: ds_write_b32 v0, v0 ; GFX8-GISEL-NEXT: flat_load_dword v0, v[1:2] glc ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX8-GISEL-NEXT: .LBB4_2: ; %ret ; GFX8-GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-GISEL-NEXT: .LBB4_2: ; %ret ; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -516,16 +552,18 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) { ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX9-SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-SDAG-NEXT: s_cbranch_execz .LBB4_2 +; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec +; GFX9-SDAG-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX9-SDAG-NEXT: s_cmov_b64 exec, vcc +; GFX9-SDAG-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX9-SDAG-NEXT: ; %bb.1: ; %use.bb ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-SDAG-NEXT: ds_write_b32 v0, v0 ; GFX9-SDAG-NEXT: s_trap 2 ; GFX9-SDAG-NEXT: global_load_dword v0, v[1:2], off glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: .LBB4_2: ; %ret ; GFX9-SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-SDAG-NEXT: .LBB4_2: ; %ret ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -537,16 +575,18 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) { ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX9-GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9-GISEL-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX9-GISEL-NEXT: s_cmov_b64 exec, vcc +; GFX9-GISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX9-GISEL-NEXT: ; %bb.1: ; %use.bb ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_trap 2 ; GFX9-GISEL-NEXT: ds_write_b32 v0, v0 ; GFX9-GISEL-NEXT: global_load_dword v0, v[1:2], off glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: .LBB4_2: ; %ret ; GFX9-GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-GISEL-NEXT: .LBB4_2: ; %ret ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -558,8 +598,10 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) { ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: v_and_b32_e32 v3, 1, v3 ; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SDAG-NEXT: s_cbranch_execz .LBB4_3 +; SDAG-NEXT: s_mov_b64 s[4:5], exec +; SDAG-NEXT: s_and_b64 s[6:7], vcc, -1 +; SDAG-NEXT: s_cmov_b64 exec, vcc +; SDAG-NEXT: s_cbranch_scc0 .LBB4_3 ; SDAG-NEXT: ; %bb.1: ; %use.bb ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: ds_write_b32 v0, v0 @@ -567,8 +609,8 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) { ; SDAG-NEXT: ; %bb.2: ; %use.bb ; SDAG-NEXT: global_load_dword v0, v[1:2], off glc ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: .LBB4_3: ; %ret ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: .LBB4_3: ; %ret ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: s_setpc_b64 s[30:31] ; SDAG-NEXT: .LBB4_4: @@ -582,8 +624,10 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) { ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_and_b32_e32 v3, 1, v3 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GISEL-NEXT: s_cbranch_execz .LBB4_3 +; GISEL-NEXT: s_mov_b64 s[4:5], exec +; GISEL-NEXT: s_and_b64 s[6:7], vcc, -1 +; GISEL-NEXT: s_cmov_b64 exec, vcc +; GISEL-NEXT: s_cbranch_scc0 .LBB4_3 ; GISEL-NEXT: ; %bb.1: ; %use.bb ; GISEL-NEXT: s_cbranch_execnz .LBB4_4 ; GISEL-NEXT: ; %bb.2: ; %use.bb @@ -591,8 +635,8 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) { ; GISEL-NEXT: ds_write_b32 v0, v0 ; GISEL-NEXT: global_load_dword v0, v[1:2], off glc ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: .LBB4_3: ; %ret ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB4_3: ; %ret ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] ; GISEL-NEXT: .LBB4_4: diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.f16.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.f16.ll index d23dee1f02f09..55222eb288c0f 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.f16.ll @@ -44,11 +44,10 @@ define half @raw_buffer_load_format_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(< ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call half @llvm.amdgcn.raw.buffer.load.format.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -97,11 +96,10 @@ define <2 x half> @raw_buffer_load_format_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_s ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_FORMAT_D16_XY_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_XY_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_XY_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call <2 x half> @llvm.amdgcn.raw.buffer.load.format.v2f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -150,11 +148,10 @@ define <4 x half> @raw_buffer_load_format_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_s ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub0 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub1 ; GFX908-NEXT: $vgpr0 = COPY [[COPY6]] @@ -207,11 +204,10 @@ define half @raw_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffset(< ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call half @llvm.amdgcn.raw.buffer.load.format.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -260,11 +256,10 @@ define <4 x half> @raw_buffer_load_format_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_s ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub0 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub1 ; GFX908-NEXT: $vgpr0 = COPY [[COPY6]] diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.ll index bdcb77201714a..1575013921d43 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.ll @@ -44,11 +44,10 @@ define float @raw_buffer_load_format_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset( ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -97,11 +96,10 @@ define <2 x float> @raw_buffer_load_format_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_ ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_FORMAT_XY_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_XY_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XY_OFFEN]].sub0 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XY_OFFEN]].sub1 ; GFX908-NEXT: $vgpr0 = COPY [[COPY6]] @@ -153,11 +151,10 @@ define <3 x float> @raw_buffer_load_format_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_ ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_FORMAT_XYZ_OFFEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_FORMAT_XYZ_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub0 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub2 @@ -211,11 +208,10 @@ define <4 x float> @raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_ ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub0 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub2 @@ -272,11 +268,10 @@ define float @raw_buffer_load_format_f32__vgpr_rsrc__sgpr_voffset__vgpr_soffset( ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -325,11 +320,10 @@ define <4 x float> @raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_ ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub0 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub2 diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.ll index 28059db0bede3..36cfda648e67b 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.ll @@ -44,11 +44,10 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i3 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -98,11 +97,10 @@ define float @raw_buffer_load_f32__sgpr_rsrc__sgpr_voffset__sgpr_soffset(<4 x i3 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -152,11 +150,10 @@ define float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i3 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -206,11 +203,10 @@ define float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffset(<4 x i3 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -260,11 +256,10 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(<4 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 1, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 1) @@ -314,11 +309,10 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(<4 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 2, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2) @@ -368,11 +362,10 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(<4 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 4, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 4) @@ -422,11 +415,10 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_dlc ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 6, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 6) @@ -476,11 +468,10 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc_dlc ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 5, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 5) @@ -530,11 +521,10 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc_slc ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 7, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 7) @@ -584,11 +574,10 @@ define <2 x float> @raw_buffer_load_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub0 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub1 ; GFX908-NEXT: $vgpr0 = COPY [[COPY6]] @@ -640,11 +629,10 @@ define <3 x float> @raw_buffer_load_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORDX3_OFFEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFEN]].sub0 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFEN]].sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFEN]].sub2 @@ -698,11 +686,10 @@ define <4 x float> @raw_buffer_load_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2 @@ -758,11 +745,10 @@ define half @raw_buffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_USHORT_OFFEN]] ; GFX908-NEXT: $vgpr0 = COPY [[COPY6]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 @@ -812,11 +798,10 @@ define <2 x half> @raw_buffer_load_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset( ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -865,11 +850,10 @@ define <4 x half> @raw_buffer_load_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset( ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub0 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub1 ; GFX908-NEXT: $vgpr0 = COPY [[COPY6]] @@ -921,11 +905,10 @@ define float @raw_buffer_load_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffset_zext(<4 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -976,11 +959,10 @@ define float @raw_buffer_load_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sext(<4 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_SBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_SBYTE_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_SBYTE_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -1031,11 +1013,10 @@ define float @raw_buffer_load_i16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_zext(<4 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -1086,11 +1067,10 @@ define float @raw_buffer_load_i16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sext(<4 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_SSHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_SSHORT_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_SSHORT_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -1142,11 +1122,10 @@ define half @raw_buffer_load_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_USHORT_OFFEN]] ; GFX908-NEXT: $vgpr0 = COPY [[COPY6]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 @@ -1197,11 +1176,10 @@ define float @raw_buffer_load_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -1251,11 +1229,10 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vdpr_voffset__sgpr_soffset__voffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) @@ -1303,11 +1280,10 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) @@ -1356,11 +1332,10 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_MOV_B32_e32_]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 4096, i32 %soffset, i32 0) @@ -1409,11 +1384,10 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 16, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %voffset = add i32 %voffset.base, 16 @@ -1463,11 +1437,10 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %voffset = add i32 %voffset.base, 4095 @@ -1519,11 +1492,10 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %voffset = add i32 %voffset.base, 4096 @@ -1569,11 +1541,10 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], killed [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0) @@ -1618,11 +1589,10 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], killed [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0) @@ -1673,11 +1643,10 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %soffset = add i32 %soffset.base, 16 @@ -1729,11 +1698,10 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %soffset = add i32 %soffset.base, 4095 @@ -1785,11 +1753,10 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %soffset = add i32 %soffset.base, 4096 @@ -1842,11 +1809,10 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %soffset = add i32 %soffset.base, 5000 @@ -1899,11 +1865,10 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 904, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %voffset = add i32 %voffset.base, 5000 diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f16.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f16.ll index ed5fa05fa8ed3..6ccd9363c0698 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f16.ll @@ -44,11 +44,10 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.format.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -96,11 +95,10 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_4095__sgpr_so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_X_OFFSET_exact [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.format.f16(half %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) ret void @@ -149,11 +147,10 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -207,11 +204,10 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.format.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -266,11 +262,10 @@ define void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.format.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -315,11 +310,10 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY1]], [[COPY]], killed [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0) ret void @@ -364,11 +358,10 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY1]], [[COPY]], killed [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0) ret void @@ -417,11 +410,10 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 16, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -471,11 +463,10 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -527,11 +518,10 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY2]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -590,11 +580,10 @@ define void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[COPY8]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.format.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f32.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f32.ll index e38de72e1f0f1..5df44b6868e53 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f32.ll @@ -44,11 +44,10 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.format.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -96,11 +95,10 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_4095__sgpr_so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.format.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) ret void @@ -154,11 +152,10 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -214,11 +211,10 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[COPY9]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.format.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -276,11 +272,10 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[COPY10]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -338,11 +333,10 @@ define void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[COPY10]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -392,11 +386,10 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY7]], [[COPY]], killed [[REG_SEQUENCE4]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0) ret void @@ -446,11 +439,10 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY7]], [[COPY]], killed [[REG_SEQUENCE4]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0) ret void @@ -504,11 +496,10 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 16, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -563,11 +554,10 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -624,11 +614,10 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY8]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -691,11 +680,10 @@ define void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[COPY10]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.ll index 39c58f8f39d59..41e45f5ca9d8e 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.ll @@ -45,11 +45,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset(< ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -99,11 +98,10 @@ define void @raw_buffer_store__sgpr_rsrc__sgpr_val__sgpr_voffset__sgpr_soffset(< ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -153,11 +151,10 @@ define void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset(< ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -207,11 +204,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr_soffset(< ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -261,11 +257,10 @@ define void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr_soffset(< ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -314,11 +309,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_g ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 1, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 1) ret void @@ -367,11 +361,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_s ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 2, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2) ret void @@ -420,11 +413,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_g ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 3, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 3) ret void @@ -473,11 +465,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_d ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 4, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 4) ret void @@ -526,11 +517,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_s ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 6, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 6) ret void @@ -579,11 +569,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_g ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 5, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 5) ret void @@ -632,11 +621,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_g ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 7, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 7) ret void @@ -690,11 +678,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -750,11 +737,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORDX3_OFFEN_exact [[COPY9]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -812,11 +798,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact [[COPY10]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -865,11 +850,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_i ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_BYTE_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s8), addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %val.trunc = trunc i32 %val to i8 call void @llvm.amdgcn.raw.buffer.store.i8(i8 %val.trunc, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -919,11 +903,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_i ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_SHORT_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %val.trunc = trunc i32 %val to i16 call void @llvm.amdgcn.raw.buffer.store.i16(i16 %val.trunc, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -974,11 +957,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_SHORT_OFFEN_exact [[COPY7]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -1027,11 +1009,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -1085,11 +1066,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -1143,11 +1123,10 @@ define void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -1195,11 +1174,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_voffset4095 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET_exact [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) ret void @@ -1248,11 +1226,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_voffset4096 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY1]], [[V_MOV_B32_e32_]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 4096, i32 %soffset, i32 0) ret void @@ -1301,11 +1278,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 16, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -1355,11 +1331,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -1411,11 +1386,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -1461,11 +1435,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY1]], [[COPY]], killed [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0) ret void @@ -1510,11 +1483,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY1]], [[COPY]], killed [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0) ret void @@ -1563,11 +1535,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 16, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -1617,11 +1588,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -1673,11 +1643,10 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -1730,11 +1699,10 @@ define void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_o ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 904, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 5000 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -1785,11 +1753,10 @@ define void @raw_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__sgpr_soffset_o ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY1]], [[V_MOV_B32_e32_]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 904, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 5000, i32 %soffset, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.f16.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.f16.ll index 5b8bd1f60233b..f7119209c9d93 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.f16.ll @@ -54,11 +54,10 @@ define half @raw_ptr_buffer_load_format_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call half @llvm.amdgcn.raw.ptr.buffer.load.format.f16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -117,11 +116,10 @@ define <2 x half> @raw_ptr_buffer_load_format_v2f16__sgpr_rsrc__vgpr_voffset__sg ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_FORMAT_D16_XY_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_XY_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_XY_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.load.format.v2f16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -180,11 +178,10 @@ define <4 x half> @raw_ptr_buffer_load_format_v4f16__sgpr_rsrc__vgpr_voffset__sg ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s64) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub0 ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub1 ; GFX908-NEXT: $vgpr0 = COPY [[COPY10]] @@ -247,11 +244,10 @@ define half @raw_ptr_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call half @llvm.amdgcn.raw.ptr.buffer.load.format.f16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -310,11 +306,10 @@ define <4 x half> @raw_ptr_buffer_load_format_v4f16__sgpr_rsrc__vgpr_voffset__sg ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit $exec :: (dereferenceable load (s64) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub0 ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub1 ; GFX908-NEXT: $vgpr0 = COPY [[COPY10]] diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.ll index 7dabd9a395746..9711282e13a70 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.ll @@ -54,11 +54,10 @@ define float @raw_ptr_buffer_load_format_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.buffer.load.format.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -117,11 +116,10 @@ define <2 x float> @raw_ptr_buffer_load_format_v2f32__sgpr_rsrc__vgpr_voffset__s ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_FORMAT_XY_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_XY_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s64) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XY_OFFEN]].sub0 ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XY_OFFEN]].sub1 ; GFX908-NEXT: $vgpr0 = COPY [[COPY10]] @@ -183,11 +181,10 @@ define <3 x float> @raw_ptr_buffer_load_format_v3f32__sgpr_rsrc__vgpr_voffset__s ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_FORMAT_XYZ_OFFEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_FORMAT_XYZ_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s96) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub0 ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub1 ; GFX908-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub2 @@ -251,11 +248,10 @@ define <4 x float> @raw_ptr_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__s ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub0 ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub1 ; GFX908-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub2 @@ -322,11 +318,10 @@ define float @raw_ptr_buffer_load_format_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.buffer.load.format.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -385,11 +380,10 @@ define <4 x float> @raw_ptr_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__s ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub0 ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub1 ; GFX908-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub2 diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.ll index 1a9f7b1619f4c..2e6e7cc0bbc8c 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.ll @@ -54,11 +54,10 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -118,11 +117,10 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__sgpr_voffset__sgpr_soffset(ptr ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -182,11 +180,10 @@ define float @raw_ptr_buffer_load_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -246,11 +243,10 @@ define float @raw_ptr_buffer_load_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffset(ptr ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -310,11 +306,10 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 1, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 1) @@ -374,11 +369,10 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 2, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 2) @@ -438,11 +432,10 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 4, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 4) @@ -502,11 +495,10 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 6, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 6) @@ -566,11 +558,10 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 5, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 5) @@ -630,11 +621,10 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 7, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 7) @@ -694,11 +684,10 @@ define <2 x float> @raw_ptr_buffer_load_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_sof ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s64) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub0 ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub1 ; GFX908-NEXT: $vgpr0 = COPY [[COPY10]] @@ -760,11 +749,10 @@ define <3 x float> @raw_ptr_buffer_load_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_sof ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORDX3_OFFEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s96) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFEN]].sub0 ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFEN]].sub1 ; GFX908-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFEN]].sub2 @@ -828,11 +816,10 @@ define <4 x float> @raw_ptr_buffer_load_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_sof ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0 ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub1 ; GFX908-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2 @@ -898,11 +885,10 @@ define half @raw_ptr_buffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_USHORT_OFFEN]] ; GFX908-NEXT: $vgpr0 = COPY [[COPY10]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 @@ -962,11 +948,10 @@ define <2 x half> @raw_ptr_buffer_load_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soff ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.load.v2f16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -1025,11 +1010,10 @@ define <4 x half> @raw_ptr_buffer_load_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soff ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s64) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub0 ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub1 ; GFX908-NEXT: $vgpr0 = COPY [[COPY10]] @@ -1091,11 +1075,10 @@ define float @raw_ptr_buffer_load_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffset_zext ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8) from %ir.rsrc, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -1156,11 +1139,10 @@ define float @raw_ptr_buffer_load_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sext ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_SBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_SBYTE_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8) from %ir.rsrc, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_SBYTE_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -1221,11 +1203,10 @@ define float @raw_ptr_buffer_load_i16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_zex ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -1286,11 +1267,10 @@ define float @raw_ptr_buffer_load_i16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sex ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_SSHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_SSHORT_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_SSHORT_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -1352,11 +1332,10 @@ define half @raw_ptr_buffer_load_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_USHORT_OFFEN]] ; GFX908-NEXT: $vgpr0 = COPY [[COPY10]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 @@ -1417,11 +1396,10 @@ define float @raw_ptr_buffer_load_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8) from %ir.rsrc, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -1481,11 +1459,10 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vdpr_voffset__sgpr_soffset__vo ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0) @@ -1543,11 +1520,10 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__vo ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 4095, i32 %soffset, i32 0) @@ -1606,11 +1582,10 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__vo ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_MOV_B32_e32_]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 4096, i32 %soffset, i32 0) @@ -1669,11 +1644,10 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_vof ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 16, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %voffset = add i32 %voffset.base, 16 @@ -1733,11 +1707,10 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__vo ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %voffset = add i32 %voffset.base, 4095 @@ -1799,11 +1772,10 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__vo ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %voffset = add i32 %voffset.base, 4096 @@ -1859,11 +1831,10 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sof ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], killed [[REG_SEQUENCE5]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 4095, i32 0) @@ -1918,11 +1889,10 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sof ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], killed [[REG_SEQUENCE5]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 4096, i32 0) @@ -1983,11 +1953,10 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sof ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %soffset = add i32 %soffset.base, 16 @@ -2049,11 +2018,10 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sof ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %soffset = add i32 %soffset.base, 4095 @@ -2115,11 +2083,10 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sof ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %soffset = add i32 %soffset.base, 4096 @@ -2182,11 +2149,10 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sof ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %soffset = add i32 %soffset.base, 5000 @@ -2249,11 +2215,10 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_vof ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 904, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %voffset = add i32 %voffset.base, 5000 diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f16.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f16.ll index eada2004161d1..6a336baf251bf 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f16.ll @@ -54,11 +54,10 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.format.f16(half %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -116,11 +115,10 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_4095__sgp ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_X_OFFSET_exact [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.format.f16(half %val, ptr addrspace(8) %rsrc, i32 4095, i32 %soffset, i32 0) ret void @@ -179,11 +177,10 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.format.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -247,11 +244,10 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.format.v4f16(<4 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -316,11 +312,10 @@ define void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.format.v4f16(<4 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -375,11 +370,10 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY1]], [[COPY]], killed [[REG_SEQUENCE5]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.format.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 4095, i32 0) ret void @@ -434,11 +428,10 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY1]], [[COPY]], killed [[REG_SEQUENCE5]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.format.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 4096, i32 0) ret void @@ -497,11 +490,10 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 16, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.ptr.buffer.store.format.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -561,11 +553,10 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.ptr.buffer.store.format.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -627,11 +618,10 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY2]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.ptr.buffer.store.format.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -700,11 +690,10 @@ define void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[COPY12]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.ptr.buffer.store.format.v4f16(<4 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f32.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f32.ll index 60db62dc43a61..1ccdd91cd13c4 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f32.ll @@ -54,11 +54,10 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.format.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -116,11 +115,10 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_4095__sgp ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.format.f32(float %val, ptr addrspace(8) %rsrc, i32 4095, i32 %soffset, i32 0) ret void @@ -184,11 +182,10 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.format.v2f32(<2 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -254,11 +251,10 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[COPY13]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s96) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.format.v3f32(<3 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -326,11 +322,10 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[COPY14]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.format.v4f32(<4 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -398,11 +393,10 @@ define void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[COPY14]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.format.v4f32(<4 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -462,11 +456,10 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY11]], [[COPY]], killed [[REG_SEQUENCE6]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.format.v2f32(<2 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 4095, i32 0) ret void @@ -526,11 +519,10 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY11]], [[COPY]], killed [[REG_SEQUENCE6]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.format.v2f32(<2 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 4096, i32 0) ret void @@ -594,11 +586,10 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 16, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.ptr.buffer.store.format.v2f32(<2 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -663,11 +654,10 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.ptr.buffer.store.format.v2f32(<2 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -734,11 +724,10 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY12]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.ptr.buffer.store.format.v2f32(<2 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -811,11 +800,10 @@ define void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgp ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[COPY14]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.ptr.buffer.store.format.v4f32(<4 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.ll index 78e29387b1d40..50df38db623a0 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.ll @@ -55,11 +55,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -119,11 +118,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__sgpr_val__sgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -183,11 +181,10 @@ define void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -247,11 +244,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -311,11 +307,10 @@ define void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -374,11 +369,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 1, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 1) ret void @@ -437,11 +431,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 2, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 2) ret void @@ -500,11 +493,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 3, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 3) ret void @@ -563,11 +555,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 4, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 4) ret void @@ -626,11 +617,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 6, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 6) ret void @@ -689,11 +679,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 5, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 5) ret void @@ -752,11 +741,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 7, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 7) ret void @@ -820,11 +808,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.v2f32(<2 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -890,11 +877,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORDX3_OFFEN_exact [[COPY13]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s96) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.v3f32(<3 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -962,11 +948,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact [[COPY14]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -1025,11 +1010,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_BYTE_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s8) into %ir.rsrc, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %val.trunc = trunc i32 %val to i8 call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 %val.trunc, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -1089,11 +1073,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_SHORT_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %val.trunc = trunc i32 %val to i16 call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 %val.trunc, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -1154,11 +1137,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_SHORT_OFFEN_exact [[COPY11]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.f16(half %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -1217,11 +1199,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -1285,11 +1266,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.v4f16(<4 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -1353,11 +1333,10 @@ define void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.v4f16(<4 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -1415,11 +1394,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_voffset ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET_exact [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 4095, i32 %soffset, i32 0) ret void @@ -1478,11 +1456,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_voffset ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY1]], [[V_MOV_B32_e32_]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 4096, i32 %soffset, i32 0) ret void @@ -1541,11 +1518,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 16, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -1605,11 +1581,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -1671,11 +1646,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -1731,11 +1705,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY1]], [[COPY]], killed [[REG_SEQUENCE5]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 4095, i32 0) ret void @@ -1790,11 +1763,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY1]], [[COPY]], killed [[REG_SEQUENCE5]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 4096, i32 0) ret void @@ -1853,11 +1825,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 16, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.ptr.buffer.store.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -1917,11 +1888,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 4095, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.ptr.buffer.store.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -1983,11 +1953,10 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.ptr.buffer.store.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -2050,11 +2019,10 @@ define void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 904, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset.add = add i32 %voffset, 5000 call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -2115,11 +2083,10 @@ define void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__sgpr_soffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY1]], [[V_MOV_B32_e32_]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 904, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 5000, i32 %soffset, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.f16.ll index 24dc4f1b3c0aa..ce3c1fb0df0da 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.f16.ll @@ -53,11 +53,10 @@ define half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr add ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call half @llvm.amdgcn.raw.ptr.tbuffer.load.f16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) @@ -116,11 +115,10 @@ define <2 x half> @raw_tbuffer_load_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_D16_XY_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_XY_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_XY_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call <2 x half> @llvm.amdgcn.raw.ptr.tbuffer.load.v2f16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) @@ -179,11 +177,10 @@ define <4 x half> @raw_tbuffer_load_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_D16_XYZW_OFFEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XYZW_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s64) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub0 ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub1 ; GFX908-NEXT: $vgpr0 = COPY [[COPY10]] @@ -245,11 +242,10 @@ define half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffset(ptr add ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call half @llvm.amdgcn.raw.ptr.tbuffer.load.f16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) @@ -308,11 +304,10 @@ define half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(ptr ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 1, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call half @llvm.amdgcn.raw.ptr.tbuffer.load.f16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 1) @@ -371,11 +366,10 @@ define half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(ptr ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 2, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call half @llvm.amdgcn.raw.ptr.tbuffer.load.f16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 2) @@ -434,11 +428,10 @@ define half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_glc ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 3, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call half @llvm.amdgcn.raw.ptr.tbuffer.load.f16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 3) @@ -497,11 +490,10 @@ define half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(ptr ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 4, 0, implicit $exec :: (dereferenceable load (s16) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call half @llvm.amdgcn.raw.ptr.tbuffer.load.f16(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 4) diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.ll index 01dc0328f2d2d..632c3ee7f548b 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.ll @@ -53,11 +53,10 @@ define float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr ad ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.tbuffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) @@ -116,11 +115,10 @@ define <2 x float> @raw_tbuffer_load_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s64) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFEN]].sub0 ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFEN]].sub1 ; GFX908-NEXT: $vgpr0 = COPY [[COPY10]] @@ -182,11 +180,10 @@ define <3 x float> @raw_tbuffer_load_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_soffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s96) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub0 ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub1 ; GFX908-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub2 @@ -250,11 +247,10 @@ define <4 x float> @raw_tbuffer_load_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub0 ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub1 ; GFX908-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub2 @@ -320,11 +316,10 @@ define float @raw_tbuffer_load_f32__vgpr_rsrc__sgpr_voffset__vgpr_soffset(ptr ad ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.tbuffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) @@ -383,11 +378,10 @@ define float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(pt ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 1, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.tbuffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 1) @@ -446,11 +440,10 @@ define float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(pt ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 2, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.tbuffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 2) @@ -509,11 +502,10 @@ define float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_gl ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 3, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.tbuffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 3) @@ -572,11 +564,10 @@ define float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(pt ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 4, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.tbuffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 4) diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.f16.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.f16.ll index cd60d5b21faba..4ef2b0f27d82b 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.f16.ll @@ -54,11 +54,10 @@ define void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(half % ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.f16(half %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -117,11 +116,10 @@ define void @raw_tbuffer_store_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -185,11 +183,10 @@ define void @raw_tbuffer_store_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.v4f16(<4 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -249,11 +246,10 @@ define void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffset(half % ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.f16(half %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) ret void @@ -313,11 +309,10 @@ define void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soffset(half % ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.f16(half %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -377,11 +372,10 @@ define void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffset(half % ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.f16(half %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -440,11 +434,10 @@ define void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(ha ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 1, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.f16(half %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 1) ret void @@ -503,11 +496,10 @@ define void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(ha ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 2, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.f16(half %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 2) ret void @@ -566,11 +558,10 @@ define void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_gl ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 3, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.f16(half %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 3) ret void @@ -629,11 +620,10 @@ define void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(ha ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 4, 0, implicit $exec :: (dereferenceable store (s16) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.f16(half %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 4) ret void diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.ll index 5c67d82c1e977..68ef30ace8255 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.ll @@ -55,11 +55,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -124,11 +123,10 @@ define void @raw_tbuffer_store_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.v2f32(<2 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -195,11 +193,10 @@ define void @raw_tbuffer_store_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<3 x ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[COPY13]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s96) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.v3f32(<3 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -268,11 +265,10 @@ define void @raw_tbuffer_store_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[COPY14]], [[COPY1]], killed [[REG_SEQUENCE6]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.v4f32(<4 x float> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -332,11 +328,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__sgpr_voffset__sgpr_soffset(float ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) ret void @@ -396,11 +391,10 @@ define void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffset(float ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 94, 1, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 1) ret void @@ -460,11 +454,10 @@ define void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffset(float ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) ret void @@ -524,11 +517,10 @@ define void @raw_tbuffer_store_f32__vgpr_rsrc__sgpr_voffset__vgpr_soffset(float ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -588,11 +580,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(fl ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 1, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 1) ret void @@ -652,11 +643,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(fl ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 2, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 2) ret void @@ -716,11 +706,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_gl ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 3, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 3) ret void @@ -780,11 +769,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(fl ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 4, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 4) ret void @@ -842,11 +830,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vdpr_voffset__sgpr_soffset__voffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY5]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 94, i32 0) ret void @@ -904,11 +891,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY5]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 4095, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 4095, i32 %soffset, i32 94, i32 0) ret void @@ -967,11 +953,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY5]], [[V_MOV_B32_e32_]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 4096, i32 %soffset, i32 94, i32 0) ret void @@ -1030,11 +1015,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 16, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset = add i32 %voffset.base, 16 call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -1094,11 +1078,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 4095, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset = add i32 %voffset.base, 4095 call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -1160,11 +1143,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset = add i32 %voffset.base, 4096 call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -1220,11 +1202,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY5]], [[COPY]], killed [[REG_SEQUENCE5]], [[S_MOV_B32_]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 4095, i32 94, i32 0) ret void @@ -1279,11 +1260,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY5]], [[COPY]], killed [[REG_SEQUENCE5]], [[S_MOV_B32_]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 4096, i32 94, i32 0) ret void @@ -1344,11 +1324,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %soffset = add i32 %soffset.base, 16 call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -1410,11 +1389,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %soffset = add i32 %soffset.base, 4095 call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -1476,11 +1454,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %soffset = add i32 %soffset.base, 4096 call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -1543,11 +1520,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %soffset = add i32 %soffset.base, 5000 call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -1610,11 +1586,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE5]], killed [[V_READFIRSTLANE_B32_4]], 904, 94, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset = add i32 %voffset.base, 5000 call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.f16.ll index bcffca8a3c4fc..d40b0051ad91b 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.f16.ll @@ -43,11 +43,10 @@ define half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i3 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) @@ -96,11 +95,10 @@ define <2 x half> @raw_tbuffer_load_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_D16_XY_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_XY_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_XY_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call <2 x half> @llvm.amdgcn.raw.tbuffer.load.v2f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) @@ -149,11 +147,10 @@ define <4 x half> @raw_tbuffer_load_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_D16_XYZW_OFFEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XYZW_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub0 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub1 ; GFX908-NEXT: $vgpr0 = COPY [[COPY6]] @@ -205,11 +202,10 @@ define half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffset(<4 x i3 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) @@ -258,11 +254,10 @@ define half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(<4 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 1, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 1) @@ -311,11 +306,10 @@ define half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(<4 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 2, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 2) @@ -364,11 +358,10 @@ define half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_glc ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 3, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 3) @@ -417,11 +410,10 @@ define half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(<4 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 4, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 4) diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.ll index 51e56a47fc2f2..db89af1b334f8 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.ll @@ -43,11 +43,10 @@ define float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) @@ -96,11 +95,10 @@ define <2 x float> @raw_tbuffer_load_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFEN]].sub0 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFEN]].sub1 ; GFX908-NEXT: $vgpr0 = COPY [[COPY6]] @@ -152,11 +150,10 @@ define <3 x float> @raw_tbuffer_load_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_soffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub0 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub2 @@ -210,11 +207,10 @@ define <4 x float> @raw_tbuffer_load_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub0 ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub1 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub2 @@ -270,11 +266,10 @@ define float @raw_tbuffer_load_f32__vgpr_rsrc__sgpr_voffset__vgpr_soffset(<4 x i ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) @@ -323,11 +318,10 @@ define float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(<4 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 1, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 1) @@ -376,11 +370,10 @@ define float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(<4 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 2, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 2) @@ -429,11 +422,10 @@ define float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_gl ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 3, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 3) @@ -482,11 +474,10 @@ define float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(<4 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 4, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 4) diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.f16.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.f16.ll index a1d8acdb4cc53..d0a3af6b55faa 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.f16.ll @@ -44,11 +44,10 @@ define void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(half % ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -97,11 +96,10 @@ define void @raw_tbuffer_store_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -155,11 +153,10 @@ define void @raw_tbuffer_store_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -209,11 +206,10 @@ define void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffset(half % ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) ret void @@ -263,11 +259,10 @@ define void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soffset(half % ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -317,11 +312,10 @@ define void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffset(half % ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -370,11 +364,10 @@ define void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(ha ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 1, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 1) ret void @@ -423,11 +416,10 @@ define void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(ha ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 2, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 2) ret void @@ -476,11 +468,10 @@ define void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_gl ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 3, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 3) ret void @@ -529,11 +520,10 @@ define void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(ha ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 4, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 4) ret void diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.ll index 7c0aa26a8a699..35eee4b857da2 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.ll @@ -45,11 +45,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -104,11 +103,10 @@ define void @raw_tbuffer_store_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -165,11 +163,10 @@ define void @raw_tbuffer_store_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<3 x ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[COPY9]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -228,11 +225,10 @@ define void @raw_tbuffer_store_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[COPY10]], [[COPY1]], killed [[REG_SEQUENCE4]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -282,11 +278,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__sgpr_voffset__sgpr_soffset(float ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) ret void @@ -336,11 +331,10 @@ define void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffset(float ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 94, 1, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 1) ret void @@ -390,11 +384,10 @@ define void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffset(float ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) ret void @@ -444,11 +437,10 @@ define void @raw_tbuffer_store_f32__vgpr_rsrc__sgpr_voffset__vgpr_soffset(float ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -498,11 +490,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(fl ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 1, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 1) ret void @@ -552,11 +543,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(fl ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 2, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 2) ret void @@ -606,11 +596,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_gl ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 3, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 3) ret void @@ -660,11 +649,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(fl ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 78, 4, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 4) ret void @@ -712,11 +700,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vdpr_voffset__sgpr_soffset__voffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY5]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 94, i32 0) ret void @@ -764,11 +751,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY5]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 4095, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 94, i32 0) ret void @@ -817,11 +803,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY5]], [[V_MOV_B32_e32_]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 4096, i32 %soffset, i32 94, i32 0) ret void @@ -870,11 +855,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 16, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset = add i32 %voffset.base, 16 call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -924,11 +908,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 4095, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset = add i32 %voffset.base, 4095 call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -980,11 +963,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset = add i32 %voffset.base, 4096 call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -1030,11 +1012,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY5]], [[COPY]], killed [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 94, i32 0) ret void @@ -1079,11 +1060,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY5]], [[COPY]], killed [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 94, i32 0) ret void @@ -1134,11 +1114,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %soffset = add i32 %soffset.base, 16 call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -1190,11 +1169,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %soffset = add i32 %soffset.base, 4095 call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -1246,11 +1224,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %soffset = add i32 %soffset.base, 4096 call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -1303,11 +1280,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %soffset = add i32 %soffset.base, 5000 call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -1360,11 +1336,10 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY6]], [[V_ADD_U32_e64_]], killed [[REG_SEQUENCE3]], killed [[V_READFIRSTLANE_B32_4]], 904, 94, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN %voffset = add i32 %voffset.base, 5000 call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/legalize-soffset-mbuf.ll b/llvm/test/CodeGen/AMDGPU/legalize-soffset-mbuf.ll index bfd97c53522c9..7d622b8d25b9b 100644 --- a/llvm/test/CodeGen/AMDGPU/legalize-soffset-mbuf.ll +++ b/llvm/test/CodeGen/AMDGPU/legalize-soffset-mbuf.ll @@ -26,11 +26,10 @@ define float @llvm_amdgcn_raw_buffer_load_f32(i32 %voffset, i32 %soffset) { ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> poison, i32 %voffset, i32 %soffset, i32 0) @@ -60,11 +59,10 @@ define float @llvm_amdgcn_raw_tbuffer_load_f32(i32 %voffset, i32 %soffset) { ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> poison, i32 %voffset, i32 %soffset, i32 0, i32 0) @@ -94,11 +92,10 @@ define <2 x float> @llvm_amdgcn_raw_buffer_load_v2f32(i32 %voffset, i32 %soffset ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub0 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub1 ; GFX908-NEXT: $vgpr0 = COPY [[COPY2]] @@ -131,11 +128,10 @@ define <2 x float> @llvm_amdgcn_raw_tbuffer_load_v2f32(i32 %voffset, i32 %soffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFEN [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFEN]].sub0 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFEN]].sub1 ; GFX908-NEXT: $vgpr0 = COPY [[COPY2]] @@ -168,11 +164,10 @@ define <3 x float> @llvm_amdgcn_raw_buffer_load_v3f32(i32 %voffset, i32 %soffset ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORDX3_OFFEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_OFFEN [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFEN]].sub0 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFEN]].sub1 ; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFEN]].sub2 @@ -207,11 +202,10 @@ define <3 x float> @llvm_amdgcn_raw_tbuffer_load_v3f32(i32 %voffset, i32 %soffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFEN [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub0 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub1 ; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub2 @@ -246,11 +240,10 @@ define <4 x float> @llvm_amdgcn_raw_buffer_load_v4f32(i32 %voffset, i32 %soffset ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub1 ; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2 @@ -287,11 +280,10 @@ define <4 x float> @llvm_amdgcn_raw_tbuffer_load_v4f32(i32 %voffset, i32 %soffse ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub0 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub1 ; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub2 @@ -329,11 +321,10 @@ define void @llvm_amdgcn_raw_buffer_store_f32(float %val, i32 %voffset, i32 %sof ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> poison, i32 %voffset, i32 %soffset, i32 0) ret void @@ -363,11 +354,10 @@ define void @llvm_amdgcn_raw_tbuffer_store_f32(float %val, i32 %voffset, i32 %so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY2]], [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> poison, i32 %voffset, i32 %soffset, i32 0, i32 0) ret void @@ -402,11 +392,10 @@ define void @llvm_amdgcn_raw_buffer_store_v2f32(<2 x float> %val, i32 %voffset, ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[COPY4]], [[COPY1]], [[DEF2]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %val, <4 x i32> poison, i32 %voffset, i32 %soffset, i32 0) ret void @@ -441,11 +430,10 @@ define void @llvm_amdgcn_raw_tbuffer_store_v2f32(<2 x float> %val, i32 %voffset, ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY4]], [[COPY1]], [[DEF2]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.v2f32(<2 x float> %val, <4 x i32> poison, i32 %voffset, i32 %soffset, i32 0, i32 0) ret void @@ -482,11 +470,10 @@ define void @llvm_amdgcn_raw_buffer_store_v3f32(<3 x float> %val, i32 %voffset, ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORDX3_OFFEN_exact [[COPY5]], [[COPY1]], [[DEF3]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.v3f32(<3 x float> %val, <4 x i32> poison, i32 %voffset, i32 %soffset, i32 0) ret void @@ -523,11 +510,10 @@ define void @llvm_amdgcn_raw_tbuffer_store_v3f32(<3 x float> %val, i32 %voffset, ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[COPY5]], [[COPY1]], [[DEF3]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.v3f32(<3 x float> %val, <4 x i32> poison, i32 %voffset, i32 %soffset, i32 0, i32 0) ret void @@ -566,11 +552,10 @@ define void @llvm_amdgcn_raw_buffer_store_v4f32(<4 x float> %val, i32 %voffset, ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact [[COPY6]], [[COPY1]], [[DEF4]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %val, <4 x i32> poison, i32 %voffset, i32 %soffset, i32 0) ret void @@ -609,11 +594,10 @@ define void @llvm_amdgcn_raw_tbuffer_store_v4f32(<4 x float> %val, i32 %voffset, ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[COPY6]], [[COPY1]], [[DEF4]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float> %val, <4 x i32> poison, i32 %voffset, i32 %soffset, i32 0, i32 0) ret void @@ -644,11 +628,10 @@ define float @llvm_amdgcn_raw_ptr_buffer_load_f32(i32 %voffset, i32 %soffset) { ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from `ptr addrspace(8) poison`, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) poison, i32 %voffset, i32 %soffset, i32 0) @@ -678,11 +661,10 @@ define float @llvm_amdgcn_raw_ptr_tbuffer_load_f32(i32 %voffset, i32 %soffset) { ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from `ptr addrspace(8) poison`, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]] ; GFX908-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.amdgcn.raw.ptr.tbuffer.load.f32(ptr addrspace(8) poison, i32 %voffset, i32 %soffset, i32 0, i32 0) @@ -712,11 +694,10 @@ define <2 x float> @llvm_amdgcn_raw_ptr_buffer_load_v2f32(i32 %voffset, i32 %sof ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s64) from `ptr addrspace(8) poison`, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub0 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub1 ; GFX908-NEXT: $vgpr0 = COPY [[COPY2]] @@ -749,11 +730,10 @@ define <2 x float> @llvm_amdgcn_raw_ptr_tbuffer_load_v2f32(i32 %voffset, i32 %so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFEN [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64) from `ptr addrspace(8) poison`, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFEN]].sub0 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFEN]].sub1 ; GFX908-NEXT: $vgpr0 = COPY [[COPY2]] @@ -786,11 +766,10 @@ define <3 x float> @llvm_amdgcn_raw_ptr_buffer_load_v3f32(i32 %voffset, i32 %sof ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORDX3_OFFEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_OFFEN [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s96) from `ptr addrspace(8) poison`, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFEN]].sub0 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFEN]].sub1 ; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFEN]].sub2 @@ -825,11 +804,10 @@ define <3 x float> @llvm_amdgcn_raw_ptr_tbuffer_load_v3f32(i32 %voffset, i32 %so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFEN [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96) from `ptr addrspace(8) poison`, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub0 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub1 ; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub2 @@ -864,11 +842,10 @@ define <4 x float> @llvm_amdgcn_raw_ptr_buffer_load_v4f32(i32 %voffset, i32 %sof ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from `ptr addrspace(8) poison`, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub1 ; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2 @@ -905,11 +882,10 @@ define <4 x float> @llvm_amdgcn_raw_ptr_tbuffer_load_v4f32(i32 %voffset, i32 %so ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from `ptr addrspace(8) poison`, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub0 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub1 ; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub2 @@ -947,11 +923,10 @@ define void @llvm_amdgcn_raw_ptr_buffer_store_f32(float %val, i32 %voffset, i32 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY2]], [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into `ptr addrspace(8) poison`, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) poison, i32 %voffset, i32 %soffset, i32 0) ret void @@ -981,11 +956,10 @@ define void @llvm_amdgcn_raw_ptr_tbuffer_store_f32(float %val, i32 %voffset, i32 ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY2]], [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into `ptr addrspace(8) poison`, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.f32(float %val, ptr addrspace(8) poison, i32 %voffset, i32 %soffset, i32 0, i32 0) ret void @@ -1020,11 +994,10 @@ define void @llvm_amdgcn_raw_ptr_buffer_store_v2f32(<2 x float> %val, i32 %voffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[COPY4]], [[COPY1]], [[DEF2]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into `ptr addrspace(8) poison`, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.v2f32(<2 x float> %val, ptr addrspace(8) poison, i32 %voffset, i32 %soffset, i32 0) ret void @@ -1059,11 +1032,10 @@ define void @llvm_amdgcn_raw_ptr_tbuffer_store_v2f32(<2 x float> %val, i32 %voff ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY4]], [[COPY1]], [[DEF2]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into `ptr addrspace(8) poison`, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.v2f32(<2 x float> %val, ptr addrspace(8) poison, i32 %voffset, i32 %soffset, i32 0, i32 0) ret void @@ -1100,11 +1072,10 @@ define void @llvm_amdgcn_raw_ptr_buffer_store_v3f32(<3 x float> %val, i32 %voffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORDX3_OFFEN_exact [[COPY5]], [[COPY1]], [[DEF3]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s96) into `ptr addrspace(8) poison`, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.v3f32(<3 x float> %val, ptr addrspace(8) poison, i32 %voffset, i32 %soffset, i32 0) ret void @@ -1141,11 +1112,10 @@ define void @llvm_amdgcn_raw_ptr_tbuffer_store_v3f32(<3 x float> %val, i32 %voff ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[COPY5]], [[COPY1]], [[DEF3]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s96) into `ptr addrspace(8) poison`, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.v3f32(<3 x float> %val, ptr addrspace(8) poison, i32 %voffset, i32 %soffset, i32 0, i32 0) ret void @@ -1184,11 +1154,10 @@ define void @llvm_amdgcn_raw_ptr_buffer_store_v4f32(<4 x float> %val, i32 %voffs ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact [[COPY6]], [[COPY1]], [[DEF4]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into `ptr addrspace(8) poison`, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %val, ptr addrspace(8) poison, i32 %voffset, i32 %soffset, i32 0) ret void @@ -1227,11 +1196,10 @@ define void @llvm_amdgcn_raw_ptr_tbuffer_store_v4f32(<4 x float> %val, i32 %voff ; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[COPY6]], [[COPY1]], [[DEF4]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into `ptr addrspace(8) poison`, align 1, addrspace 8) - ; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX908-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: - ; GFX908-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; GFX908-NEXT: SI_RETURN call void @llvm.amdgcn.raw.ptr.tbuffer.store.v4f32(<4 x float> %val, ptr addrspace(8) poison, i32 %voffset, i32 %soffset, i32 0, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll index 76cff962f7c20..c3675f4dd5ba8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s ; XUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll index cee5bbbe85f48..567bc150d6af6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; RUN: llc -global-isel=0 -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s ; RUN: llc -global-isel=1 -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll index 224de9512c493..0a3b95d6eb397 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefixes=GCN,PREGFX11 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefixes=GCN,GFX10,PREGFX11 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefixes=GCN,GFX11 %s diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll index 71ed71cd84bcd..ae28843238b21 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll @@ -126,12 +126,16 @@ endif: define amdgpu_cs void @inverse_ballot_branch(i32 inreg %s0_1, i32 inreg %s2, ptr addrspace(1) %out) { ; GISEL-LABEL: inverse_ballot_branch: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: s_xor_b32 s2, s1, -1 -; GISEL-NEXT: s_and_saveexec_b32 s1, s2 +; GISEL-NEXT: s_xor_b32 s1, s1, -1 +; GISEL-NEXT: s_mov_b32 s2, exec_lo +; GISEL-NEXT: s_and_b32 s1, s1, exec_lo +; GISEL-NEXT: s_and_b32 s3, s1, -1 +; GISEL-NEXT: s_cmov_b32 exec_lo, s1 +; GISEL-NEXT: s_cbranch_scc0 .LBB6_2 ; GISEL-NEXT: ; %bb.1: ; %if ; GISEL-NEXT: s_add_i32 s0, s0, 1 -; GISEL-NEXT: ; %bb.2: ; %endif -; GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GISEL-NEXT: .LBB6_2: ; %endif ; GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-NEXT: global_store_b32 v[0:1], v2, off ; GISEL-NEXT: s_nop 0 @@ -140,14 +144,18 @@ define amdgpu_cs void @inverse_ballot_branch(i32 inreg %s0_1, i32 inreg %s2, ptr ; ; SDAG-LABEL: inverse_ballot_branch: ; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: s_xor_b32 s1, s1, -1 ; SDAG-NEXT: v_mov_b32_e32 v2, s0 -; SDAG-NEXT: s_xor_b32 s2, s1, -1 -; SDAG-NEXT: s_and_saveexec_b32 s1, s2 +; SDAG-NEXT: s_and_b32 s1, s1, exec_lo +; SDAG-NEXT: s_mov_b32 s2, exec_lo +; SDAG-NEXT: s_and_b32 s3, s1, -1 +; SDAG-NEXT: s_cmov_b32 exec_lo, s1 +; SDAG-NEXT: s_cbranch_scc0 .LBB6_2 ; SDAG-NEXT: ; %bb.1: ; %if ; SDAG-NEXT: s_add_i32 s0, s0, 1 ; SDAG-NEXT: v_mov_b32_e32 v2, s0 -; SDAG-NEXT: ; %bb.2: ; %endif -; SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; SDAG-NEXT: .LBB6_2: ; %endif ; SDAG-NEXT: global_store_b32 v[0:1], v2, off ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll index 2e3dc11feed1e..e9396e7da51c4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll @@ -210,13 +210,17 @@ endif: define amdgpu_cs void @inverse_ballot_branch(i64 inreg %s0_1, i64 inreg %s2, ptr addrspace(1) %out) { ; GISEL-LABEL: inverse_ballot_branch: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: s_xor_b64 s[4:5], s[2:3], -1 -; GISEL-NEXT: s_and_saveexec_b64 s[2:3], s[4:5] +; GISEL-NEXT: s_xor_b64 s[2:3], s[2:3], -1 +; GISEL-NEXT: s_mov_b64 s[4:5], exec +; GISEL-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GISEL-NEXT: s_and_b64 s[6:7], s[2:3], -1 +; GISEL-NEXT: s_cmov_b64 exec, s[2:3] +; GISEL-NEXT: s_cbranch_scc0 .LBB6_2 ; GISEL-NEXT: ; %bb.1: ; %if ; GISEL-NEXT: s_add_u32 s0, s0, 1 ; GISEL-NEXT: s_addc_u32 s1, s1, 0 -; GISEL-NEXT: ; %bb.2: ; %endif -; GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB6_2: ; %endif ; GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -227,16 +231,20 @@ define amdgpu_cs void @inverse_ballot_branch(i64 inreg %s0_1, i64 inreg %s2, ptr ; SDAG-LABEL: inverse_ballot_branch: ; SDAG: ; %bb.0: ; %entry ; SDAG-NEXT: v_mov_b32_e32 v3, s1 +; SDAG-NEXT: s_xor_b64 s[2:3], s[2:3], -1 ; SDAG-NEXT: v_mov_b32_e32 v2, s0 -; SDAG-NEXT: s_xor_b64 s[4:5], s[2:3], -1 -; SDAG-NEXT: s_and_saveexec_b64 s[2:3], s[4:5] +; SDAG-NEXT: s_and_b64 s[2:3], s[2:3], exec +; SDAG-NEXT: s_mov_b64 s[4:5], exec +; SDAG-NEXT: s_and_b64 s[6:7], s[2:3], -1 +; SDAG-NEXT: s_cmov_b64 exec, s[2:3] +; SDAG-NEXT: s_cbranch_scc0 .LBB6_2 ; SDAG-NEXT: ; %bb.1: ; %if ; SDAG-NEXT: s_add_u32 s0, s0, 1 ; SDAG-NEXT: s_addc_u32 s1, s1, 0 ; SDAG-NEXT: v_mov_b32_e32 v3, s1 ; SDAG-NEXT: v_mov_b32_e32 v2, s0 -; SDAG-NEXT: ; %bb.2: ; %endif -; SDAG-NEXT: s_or_b64 exec, exec, s[2:3] +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: .LBB6_2: ; %endif ; SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll index 955d8ae5cc054..9aa28c1d65219 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s ; CHECK-LABEL: {{^}}test1: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll index f52461b6b3807..94c08d890a2fa 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll @@ -529,19 +529,22 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-LABEL: divergent_cfg: ; GFX8DAGISEL: ; %bb.0: ; %entry ; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 -; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr4 -; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX8DAGISEL-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX8DAGISEL-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr6 +; GFX8DAGISEL-NEXT: s_cmov_b64 exec, vcc +; GFX8DAGISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX8DAGISEL-NEXT: ; %bb.1: ; %else -; GFX8DAGISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX8DAGISEL-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] +; GFX8DAGISEL-NEXT: s_xor_b64 s[2:3], s[4:5], exec +; GFX8DAGISEL-NEXT: s_and_b64 s[8:9], s[4:5], -1 ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] -; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX8DAGISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GFX8DAGISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX8DAGISEL-NEXT: ; %bb.3: ; %if ; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec ; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0 @@ -554,8 +557,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX8DAGISEL-NEXT: ; %bb.5: ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -566,18 +569,22 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL-LABEL: divergent_cfg: ; GFX8GISEL: ; %bb.0: ; %entry ; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX8GISEL-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX8GISEL-NEXT: s_and_b64 s[2:3], vcc, -1 ; GFX8GISEL-NEXT: ; implicit-def: $sgpr6 -; GFX8GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX8GISEL-NEXT: s_cmov_b64 exec, vcc +; GFX8GISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX8GISEL-NEXT: ; %bb.1: ; %else -; GFX8GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX8GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX8GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8GISEL-NEXT: s_mov_b32 s6, s4 +; GFX8GISEL-NEXT: s_mov_b32 s6, s2 +; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8GISEL-NEXT: .LBB4_2: ; %Flow -; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX8GISEL-NEXT: s_xor_b64 s[2:3], s[4:5], exec +; GFX8GISEL-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GFX8GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GFX8GISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX8GISEL-NEXT: ; %bb.3: ; %if ; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec ; GFX8GISEL-NEXT: s_mov_b32 s6, 0 @@ -588,8 +595,9 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL-NEXT: s_max_u32 s6, s6, s8 ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4 -; GFX8GISEL-NEXT: .LBB4_5: ; %endif +; GFX8GISEL-NEXT: ; %bb.5: ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8GISEL-NEXT: .LBB4_6: ; %endif ; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -601,19 +609,22 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9DAGISEL-LABEL: divergent_cfg: ; GFX9DAGISEL: ; %bb.0: ; %entry ; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 -; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr4 -; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX9DAGISEL-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX9DAGISEL-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr6 +; GFX9DAGISEL-NEXT: s_cmov_b64 exec, vcc +; GFX9DAGISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX9DAGISEL-NEXT: ; %bb.1: ; %else -; GFX9DAGISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9DAGISEL-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] +; GFX9DAGISEL-NEXT: s_xor_b64 s[2:3], s[4:5], exec +; GFX9DAGISEL-NEXT: s_and_b64 s[8:9], s[4:5], -1 ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] -; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9DAGISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9DAGISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX9DAGISEL-NEXT: ; %bb.3: ; %if ; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec ; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0 @@ -626,8 +637,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX9DAGISEL-NEXT: ; %bb.5: ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -637,18 +648,22 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9GISEL-LABEL: divergent_cfg: ; GFX9GISEL: ; %bb.0: ; %entry ; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX9GISEL-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX9GISEL-NEXT: s_and_b64 s[2:3], vcc, -1 ; GFX9GISEL-NEXT: ; implicit-def: $sgpr6 -; GFX9GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX9GISEL-NEXT: s_cmov_b64 exec, vcc +; GFX9GISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX9GISEL-NEXT: ; %bb.1: ; %else -; GFX9GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9GISEL-NEXT: s_mov_b32 s6, s4 +; GFX9GISEL-NEXT: s_mov_b32 s6, s2 +; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9GISEL-NEXT: .LBB4_2: ; %Flow -; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX9GISEL-NEXT: s_xor_b64 s[2:3], s[4:5], exec +; GFX9GISEL-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GFX9GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9GISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX9GISEL-NEXT: ; %bb.3: ; %if ; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec ; GFX9GISEL-NEXT: s_mov_b32 s6, 0 @@ -659,8 +674,9 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9GISEL-NEXT: s_max_u32 s6, s6, s8 ; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4 -; GFX9GISEL-NEXT: .LBB4_5: ; %endif +; GFX9GISEL-NEXT: ; %bb.5: ; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9GISEL-NEXT: .LBB4_6: ; %endif ; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -671,19 +687,22 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064DAGISEL-LABEL: divergent_cfg: ; GFX1064DAGISEL: ; %bb.0: ; %entry ; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 -; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr4 -; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1064DAGISEL-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX1064DAGISEL-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr2 +; GFX1064DAGISEL-NEXT: s_cmov_b64 exec, vcc +; GFX1064DAGISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1064DAGISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX1064DAGISEL-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] -; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064DAGISEL-NEXT: s_xor_b64 s[2:3], s[4:5], exec +; GFX1064DAGISEL-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064DAGISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GFX1064DAGISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec ; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0 @@ -696,8 +715,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1064DAGISEL-NEXT: ; %bb.5: ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -708,17 +727,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064GISEL: ; %bb.0: ; %entry ; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 ; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6 -; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1064GISEL-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX1064GISEL-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1064GISEL-NEXT: s_cmov_b64 exec, vcc +; GFX1064GISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1064GISEL-NEXT: ; %bb.1: ; %else -; GFX1064GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX1064GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064GISEL-NEXT: s_mov_b32 s6, s4 +; GFX1064GISEL-NEXT: s_mov_b32 s6, s2 +; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow -; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1064GISEL-NEXT: s_xor_b64 s[2:3], s[4:5], exec +; GFX1064GISEL-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GFX1064GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GFX1064GISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX1064GISEL-NEXT: ; %bb.3: ; %if ; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec ; GFX1064GISEL-NEXT: s_mov_b32 s6, 0 @@ -729,8 +752,9 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064GISEL-NEXT: s_max_u32 s6, s6, s8 ; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4 -; GFX1064GISEL-NEXT: .LBB4_5: ; %endif +; GFX1064GISEL-NEXT: ; %bb.5: ; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064GISEL-NEXT: .LBB4_6: ; %endif ; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -741,19 +765,22 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032DAGISEL-LABEL: divergent_cfg: ; GFX1032DAGISEL: ; %bb.0: ; %entry ; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0 -; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr3 -; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s2 -; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032DAGISEL-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX1032DAGISEL-NEXT: s_and_b32 s2, vcc_lo, -1 +; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr2 +; GFX1032DAGISEL-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032DAGISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1032DAGISEL-NEXT: s_load_dword s3, s[0:1], 0x2c +; GFX1032DAGISEL-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s2, s2 ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2 -; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032DAGISEL-NEXT: s_xor_b32 s2, s3, exec_lo +; GFX1032DAGISEL-NEXT: s_and_b32 s4, s3, -1 +; GFX1032DAGISEL-NEXT: s_cmov_b32 exec_lo, s3 +; GFX1032DAGISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo ; GFX1032DAGISEL-NEXT: s_mov_b32 s3, 0 @@ -766,8 +793,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1032DAGISEL-NEXT: ; %bb.5: ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -777,18 +804,22 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032GISEL-LABEL: divergent_cfg: ; GFX1032GISEL: ; %bb.0: ; %entry ; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0 +; GFX1032GISEL-NEXT: s_xor_b32 s4, vcc_lo, exec_lo +; GFX1032GISEL-NEXT: s_and_b32 s2, vcc_lo, -1 ; GFX1032GISEL-NEXT: ; implicit-def: $sgpr2 -; GFX1032GISEL-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032GISEL-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032GISEL-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032GISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1032GISEL-NEXT: ; %bb.1: ; %else ; GFX1032GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mov_b32 s2, s2 +; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow -; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s3, s3 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1032GISEL-NEXT: s_xor_b32 s3, s4, exec_lo +; GFX1032GISEL-NEXT: s_and_b32 s5, s4, -1 +; GFX1032GISEL-NEXT: s_cmov_b32 exec_lo, s4 +; GFX1032GISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX1032GISEL-NEXT: ; %bb.3: ; %if ; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo ; GFX1032GISEL-NEXT: s_mov_b32 s2, 0 @@ -799,8 +830,9 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032GISEL-NEXT: s_max_u32 s2, s2, s6 ; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4 -; GFX1032GISEL-NEXT: .LBB4_5: ; %endif +; GFX1032GISEL-NEXT: ; %bb.5: ; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032GISEL-NEXT: .LBB4_6: ; %endif ; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -810,20 +842,23 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; ; GFX1164DAGISEL-LABEL: divergent_cfg: ; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec -; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr4 -; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 -; GFX1164DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX1164DAGISEL-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX1164DAGISEL-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr2 +; GFX1164DAGISEL-NEXT: s_cmov_b64 exec, vcc +; GFX1164DAGISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c ; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] -; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: s_xor_b64 s[2:3], s[4:5], exec +; GFX1164DAGISEL-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164DAGISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GFX1164DAGISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1164DAGISEL-NEXT: s_mov_b64 s[4:5], exec ; GFX1164DAGISEL-NEXT: s_mov_b32 s6, 0 @@ -837,8 +872,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1164DAGISEL-NEXT: ; %bb.5: ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -849,19 +884,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; ; GFX1164GISEL-LABEL: divergent_cfg: ; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 ; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6 -; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 -; GFX1164GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164GISEL-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX1164GISEL-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164GISEL-NEXT: s_cmov_b64 exec, vcc +; GFX1164GISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1164GISEL-NEXT: ; %bb.1: ; %else -; GFX1164GISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c +; GFX1164GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c ; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164GISEL-NEXT: s_mov_b32 s6, s4 +; GFX1164GISEL-NEXT: s_mov_b32 s6, s2 +; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow -; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[2:3], s[2:3] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: s_xor_b64 s[2:3], s[4:5], exec +; GFX1164GISEL-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GFX1164GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GFX1164GISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX1164GISEL-NEXT: ; %bb.3: ; %if ; GFX1164GISEL-NEXT: s_mov_b64 s[4:5], exec ; GFX1164GISEL-NEXT: s_mov_b32 s6, 0 @@ -873,8 +913,9 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164GISEL-NEXT: s_max_u32 s6, s6, s8 ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4 -; GFX1164GISEL-NEXT: .LBB4_5: ; %endif +; GFX1164GISEL-NEXT: ; %bb.5: ; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164GISEL-NEXT: .LBB4_6: ; %endif ; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -886,20 +927,23 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; ; GFX1132DAGISEL-LABEL: divergent_cfg: ; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr3 -; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 -; GFX1132DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s2 -; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0 +; GFX1132DAGISEL-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX1132DAGISEL-NEXT: s_and_b32 s2, vcc_lo, -1 +; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr2 +; GFX1132DAGISEL-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132DAGISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1132DAGISEL-NEXT: s_load_b32 s3, s[0:1], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c ; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, s2 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2 -; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132DAGISEL-NEXT: s_xor_b32 s2, s3, exec_lo +; GFX1132DAGISEL-NEXT: s_and_b32 s4, s3, -1 +; GFX1132DAGISEL-NEXT: s_cmov_b32 exec_lo, s3 +; GFX1132DAGISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1132DAGISEL-NEXT: s_mov_b32 s4, exec_lo ; GFX1132DAGISEL-NEXT: s_mov_b32 s3, 0 @@ -913,8 +957,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1132DAGISEL-NEXT: ; %bb.5: ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -925,19 +969,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; ; GFX1132GISEL-LABEL: divergent_cfg: ; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0 +; GFX1132GISEL-NEXT: s_xor_b32 s4, vcc_lo, exec_lo +; GFX1132GISEL-NEXT: s_and_b32 s2, vcc_lo, -1 ; GFX1132GISEL-NEXT: ; implicit-def: $sgpr2 -; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 -; GFX1132GISEL-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132GISEL-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132GISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1132GISEL-NEXT: ; %bb.1: ; %else ; GFX1132GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c ; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mov_b32 s2, s2 +; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow -; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s3, s3 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: s_xor_b32 s3, s4, exec_lo +; GFX1132GISEL-NEXT: s_and_b32 s5, s4, -1 +; GFX1132GISEL-NEXT: s_cmov_b32 exec_lo, s4 +; GFX1132GISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX1132GISEL-NEXT: ; %bb.3: ; %if ; GFX1132GISEL-NEXT: s_mov_b32 s4, exec_lo ; GFX1132GISEL-NEXT: s_mov_b32 s2, 0 @@ -949,8 +998,9 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132GISEL-NEXT: s_max_u32 s2, s2, s6 ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4 -; GFX1132GISEL-NEXT: .LBB4_5: ; %endif +; GFX1132GISEL-NEXT: ; %bb.5: ; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132GISEL-NEXT: .LBB4_6: ; %endif ; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll index bfdb2da6dc6a4..5c0e55d2bb493 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll @@ -530,19 +530,22 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-LABEL: divergent_cfg: ; GFX8DAGISEL: ; %bb.0: ; %entry ; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 -; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr4 -; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX8DAGISEL-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX8DAGISEL-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr6 +; GFX8DAGISEL-NEXT: s_cmov_b64 exec, vcc +; GFX8DAGISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX8DAGISEL-NEXT: ; %bb.1: ; %else -; GFX8DAGISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX8DAGISEL-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] +; GFX8DAGISEL-NEXT: s_xor_b64 s[2:3], s[4:5], exec +; GFX8DAGISEL-NEXT: s_and_b64 s[8:9], s[4:5], -1 ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] -; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX8DAGISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GFX8DAGISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX8DAGISEL-NEXT: ; %bb.3: ; %if ; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec ; GFX8DAGISEL-NEXT: s_mov_b32 s6, -1 @@ -555,8 +558,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX8DAGISEL-NEXT: ; %bb.5: ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -567,18 +570,22 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL-LABEL: divergent_cfg: ; GFX8GISEL: ; %bb.0: ; %entry ; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX8GISEL-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX8GISEL-NEXT: s_and_b64 s[2:3], vcc, -1 ; GFX8GISEL-NEXT: ; implicit-def: $sgpr6 -; GFX8GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX8GISEL-NEXT: s_cmov_b64 exec, vcc +; GFX8GISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX8GISEL-NEXT: ; %bb.1: ; %else -; GFX8GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX8GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX8GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8GISEL-NEXT: s_mov_b32 s6, s4 +; GFX8GISEL-NEXT: s_mov_b32 s6, s2 +; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8GISEL-NEXT: .LBB4_2: ; %Flow -; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX8GISEL-NEXT: s_xor_b64 s[2:3], s[4:5], exec +; GFX8GISEL-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GFX8GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GFX8GISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX8GISEL-NEXT: ; %bb.3: ; %if ; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec ; GFX8GISEL-NEXT: s_mov_b32 s6, -1 @@ -589,8 +596,9 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL-NEXT: s_min_u32 s6, s6, s8 ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4 -; GFX8GISEL-NEXT: .LBB4_5: ; %endif +; GFX8GISEL-NEXT: ; %bb.5: ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8GISEL-NEXT: .LBB4_6: ; %endif ; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -602,19 +610,22 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9DAGISEL-LABEL: divergent_cfg: ; GFX9DAGISEL: ; %bb.0: ; %entry ; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 -; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr4 -; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX9DAGISEL-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX9DAGISEL-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr6 +; GFX9DAGISEL-NEXT: s_cmov_b64 exec, vcc +; GFX9DAGISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX9DAGISEL-NEXT: ; %bb.1: ; %else -; GFX9DAGISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9DAGISEL-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] +; GFX9DAGISEL-NEXT: s_xor_b64 s[2:3], s[4:5], exec +; GFX9DAGISEL-NEXT: s_and_b64 s[8:9], s[4:5], -1 ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] -; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9DAGISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9DAGISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX9DAGISEL-NEXT: ; %bb.3: ; %if ; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec ; GFX9DAGISEL-NEXT: s_mov_b32 s6, -1 @@ -627,8 +638,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX9DAGISEL-NEXT: ; %bb.5: ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -638,18 +649,22 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9GISEL-LABEL: divergent_cfg: ; GFX9GISEL: ; %bb.0: ; %entry ; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX9GISEL-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX9GISEL-NEXT: s_and_b64 s[2:3], vcc, -1 ; GFX9GISEL-NEXT: ; implicit-def: $sgpr6 -; GFX9GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX9GISEL-NEXT: s_cmov_b64 exec, vcc +; GFX9GISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX9GISEL-NEXT: ; %bb.1: ; %else -; GFX9GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9GISEL-NEXT: s_mov_b32 s6, s4 +; GFX9GISEL-NEXT: s_mov_b32 s6, s2 +; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9GISEL-NEXT: .LBB4_2: ; %Flow -; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX9GISEL-NEXT: s_xor_b64 s[2:3], s[4:5], exec +; GFX9GISEL-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GFX9GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9GISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX9GISEL-NEXT: ; %bb.3: ; %if ; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec ; GFX9GISEL-NEXT: s_mov_b32 s6, -1 @@ -660,8 +675,9 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9GISEL-NEXT: s_min_u32 s6, s6, s8 ; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4 -; GFX9GISEL-NEXT: .LBB4_5: ; %endif +; GFX9GISEL-NEXT: ; %bb.5: ; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9GISEL-NEXT: .LBB4_6: ; %endif ; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -672,19 +688,22 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064DAGISEL-LABEL: divergent_cfg: ; GFX1064DAGISEL: ; %bb.0: ; %entry ; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 -; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr4 -; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1064DAGISEL-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX1064DAGISEL-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr2 +; GFX1064DAGISEL-NEXT: s_cmov_b64 exec, vcc +; GFX1064DAGISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1064DAGISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX1064DAGISEL-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] -; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064DAGISEL-NEXT: s_xor_b64 s[2:3], s[4:5], exec +; GFX1064DAGISEL-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064DAGISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GFX1064DAGISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec ; GFX1064DAGISEL-NEXT: s_mov_b32 s6, -1 @@ -697,8 +716,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1064DAGISEL-NEXT: ; %bb.5: ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -709,17 +728,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064GISEL: ; %bb.0: ; %entry ; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 ; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6 -; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1064GISEL-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX1064GISEL-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1064GISEL-NEXT: s_cmov_b64 exec, vcc +; GFX1064GISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1064GISEL-NEXT: ; %bb.1: ; %else -; GFX1064GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX1064GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064GISEL-NEXT: s_mov_b32 s6, s4 +; GFX1064GISEL-NEXT: s_mov_b32 s6, s2 +; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow -; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1064GISEL-NEXT: s_xor_b64 s[2:3], s[4:5], exec +; GFX1064GISEL-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GFX1064GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GFX1064GISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX1064GISEL-NEXT: ; %bb.3: ; %if ; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec ; GFX1064GISEL-NEXT: s_mov_b32 s6, -1 @@ -730,8 +753,9 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064GISEL-NEXT: s_min_u32 s6, s6, s8 ; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4 -; GFX1064GISEL-NEXT: .LBB4_5: ; %endif +; GFX1064GISEL-NEXT: ; %bb.5: ; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064GISEL-NEXT: .LBB4_6: ; %endif ; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -742,19 +766,22 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032DAGISEL-LABEL: divergent_cfg: ; GFX1032DAGISEL: ; %bb.0: ; %entry ; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0 -; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr3 -; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s2 -; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032DAGISEL-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX1032DAGISEL-NEXT: s_and_b32 s2, vcc_lo, -1 +; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr2 +; GFX1032DAGISEL-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032DAGISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1032DAGISEL-NEXT: s_load_dword s3, s[0:1], 0x2c +; GFX1032DAGISEL-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s2, s2 ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2 -; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032DAGISEL-NEXT: s_xor_b32 s2, s3, exec_lo +; GFX1032DAGISEL-NEXT: s_and_b32 s4, s3, -1 +; GFX1032DAGISEL-NEXT: s_cmov_b32 exec_lo, s3 +; GFX1032DAGISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo ; GFX1032DAGISEL-NEXT: s_mov_b32 s3, -1 @@ -767,8 +794,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1032DAGISEL-NEXT: ; %bb.5: ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -778,18 +805,22 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032GISEL-LABEL: divergent_cfg: ; GFX1032GISEL: ; %bb.0: ; %entry ; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0 +; GFX1032GISEL-NEXT: s_xor_b32 s4, vcc_lo, exec_lo +; GFX1032GISEL-NEXT: s_and_b32 s2, vcc_lo, -1 ; GFX1032GISEL-NEXT: ; implicit-def: $sgpr2 -; GFX1032GISEL-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032GISEL-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032GISEL-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032GISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1032GISEL-NEXT: ; %bb.1: ; %else ; GFX1032GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mov_b32 s2, s2 +; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow -; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s3, s3 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1032GISEL-NEXT: s_xor_b32 s3, s4, exec_lo +; GFX1032GISEL-NEXT: s_and_b32 s5, s4, -1 +; GFX1032GISEL-NEXT: s_cmov_b32 exec_lo, s4 +; GFX1032GISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX1032GISEL-NEXT: ; %bb.3: ; %if ; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo ; GFX1032GISEL-NEXT: s_mov_b32 s2, -1 @@ -800,8 +831,9 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032GISEL-NEXT: s_min_u32 s2, s2, s6 ; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4 -; GFX1032GISEL-NEXT: .LBB4_5: ; %endif +; GFX1032GISEL-NEXT: ; %bb.5: ; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032GISEL-NEXT: .LBB4_6: ; %endif ; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -811,20 +843,23 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; ; GFX1164DAGISEL-LABEL: divergent_cfg: ; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec -; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr4 -; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 -; GFX1164DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX1164DAGISEL-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX1164DAGISEL-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr2 +; GFX1164DAGISEL-NEXT: s_cmov_b64 exec, vcc +; GFX1164DAGISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c ; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] -; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: s_xor_b64 s[2:3], s[4:5], exec +; GFX1164DAGISEL-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1164DAGISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GFX1164DAGISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1164DAGISEL-NEXT: s_mov_b64 s[4:5], exec ; GFX1164DAGISEL-NEXT: s_mov_b32 s6, -1 @@ -838,8 +873,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1164DAGISEL-NEXT: ; %bb.5: ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -850,19 +885,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; ; GFX1164GISEL-LABEL: divergent_cfg: ; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 ; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6 -; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 -; GFX1164GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164GISEL-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX1164GISEL-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1164GISEL-NEXT: s_cmov_b64 exec, vcc +; GFX1164GISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1164GISEL-NEXT: ; %bb.1: ; %else -; GFX1164GISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c +; GFX1164GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c ; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164GISEL-NEXT: s_mov_b32 s6, s4 +; GFX1164GISEL-NEXT: s_mov_b32 s6, s2 +; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow -; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[2:3], s[2:3] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: s_xor_b64 s[2:3], s[4:5], exec +; GFX1164GISEL-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GFX1164GISEL-NEXT: s_cmov_b64 exec, s[4:5] +; GFX1164GISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX1164GISEL-NEXT: ; %bb.3: ; %if ; GFX1164GISEL-NEXT: s_mov_b64 s[4:5], exec ; GFX1164GISEL-NEXT: s_mov_b32 s6, -1 @@ -874,8 +914,9 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164GISEL-NEXT: s_min_u32 s6, s6, s8 ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4 -; GFX1164GISEL-NEXT: .LBB4_5: ; %endif +; GFX1164GISEL-NEXT: ; %bb.5: ; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164GISEL-NEXT: .LBB4_6: ; %endif ; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -887,20 +928,23 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; ; GFX1132DAGISEL-LABEL: divergent_cfg: ; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr3 -; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 -; GFX1132DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s2 -; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0 +; GFX1132DAGISEL-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX1132DAGISEL-NEXT: s_and_b32 s2, vcc_lo, -1 +; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr2 +; GFX1132DAGISEL-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132DAGISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1132DAGISEL-NEXT: s_load_b32 s3, s[0:1], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c ; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0 +; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1132DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, s2 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2 -; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132DAGISEL-NEXT: s_xor_b32 s2, s3, exec_lo +; GFX1132DAGISEL-NEXT: s_and_b32 s4, s3, -1 +; GFX1132DAGISEL-NEXT: s_cmov_b32 exec_lo, s3 +; GFX1132DAGISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1132DAGISEL-NEXT: s_mov_b32 s4, exec_lo ; GFX1132DAGISEL-NEXT: s_mov_b32 s3, -1 @@ -914,8 +958,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1132DAGISEL-NEXT: ; %bb.5: ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -926,19 +970,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; ; GFX1132GISEL-LABEL: divergent_cfg: ; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0 +; GFX1132GISEL-NEXT: s_xor_b32 s4, vcc_lo, exec_lo +; GFX1132GISEL-NEXT: s_and_b32 s2, vcc_lo, -1 ; GFX1132GISEL-NEXT: ; implicit-def: $sgpr2 -; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 -; GFX1132GISEL-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132GISEL-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1132GISEL-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX1132GISEL-NEXT: ; %bb.1: ; %else ; GFX1132GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c ; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mov_b32 s2, s2 +; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow -; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s3, s3 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: s_xor_b32 s3, s4, exec_lo +; GFX1132GISEL-NEXT: s_and_b32 s5, s4, -1 +; GFX1132GISEL-NEXT: s_cmov_b32 exec_lo, s4 +; GFX1132GISEL-NEXT: s_cbranch_scc0 .LBB4_6 ; GFX1132GISEL-NEXT: ; %bb.3: ; %if ; GFX1132GISEL-NEXT: s_mov_b32 s4, exec_lo ; GFX1132GISEL-NEXT: s_mov_b32 s2, -1 @@ -950,8 +999,9 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132GISEL-NEXT: s_min_u32 s2, s2, s6 ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4 -; GFX1132GISEL-NEXT: .LBB4_5: ; %endif +; GFX1132GISEL-NEXT: ; %bb.5: ; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132GISEL-NEXT: .LBB4_6: ; %endif ; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll index fdd457ca992ea..0b08dae1a1e50 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll @@ -1,3 +1,4 @@ +; XFAIL: * ;RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,SIVI %s ;RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,VIPLUS,SIVI %s ;RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,VIPLUS,GFX9 %s diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll index 5fb50d7e8589a..531c3e7cd08a4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll @@ -176,14 +176,18 @@ define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inr ; CHECK-LABEL: test_control_flow_0: ; CHECK: ; %bb.0: ; %main_body ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CHECK-NEXT: s_and_saveexec_b64 s[0:1], vcc -; CHECK-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; CHECK-NEXT: s_cbranch_execz .LBB6_2 +; CHECK-NEXT: s_xor_b64 s[0:1], vcc, exec +; CHECK-NEXT: s_and_b64 s[2:3], vcc, -1 +; CHECK-NEXT: s_cmov_b64 exec, vcc +; CHECK-NEXT: s_cbranch_scc0 .LBB6_2 ; CHECK-NEXT: ; %bb.1: ; %ELSE ; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen +; CHECK-NEXT: s_or_b64 exec, exec, s[0:1] ; CHECK-NEXT: .LBB6_2: ; %Flow -; CHECK-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; CHECK-NEXT: s_cbranch_execz .LBB6_4 +; CHECK-NEXT: s_xor_b64 s[2:3], s[0:1], exec +; CHECK-NEXT: s_and_b64 s[4:5], s[0:1], -1 +; CHECK-NEXT: s_cmov_b64 exec, s[0:1] +; CHECK-NEXT: s_cbranch_scc0 .LBB6_4 ; CHECK-NEXT: ; %bb.3: ; %IF ; CHECK-NEXT: v_mov_b32_e32 v0, s12 ; CHECK-NEXT: v_mov_b32_e32 v1, s13 @@ -192,8 +196,8 @@ define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inr ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_add_f32_e32 v2, v0, v1 ; CHECK-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec +; CHECK-NEXT: s_or_b64 exec, exec, s[2:3] ; CHECK-NEXT: .LBB6_4: ; %END -; CHECK-NEXT: s_or_b64 exec, exec, s[0:1] ; CHECK-NEXT: v_mov_b32_e32 v0, v2 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: ; return to shader part epilog @@ -225,9 +229,10 @@ define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inr ; CHECK-NEXT: s_mov_b64 s[14:15], exec ; CHECK-NEXT: s_wqm_b64 exec, exec ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CHECK-NEXT: s_and_saveexec_b64 s[16:17], vcc -; CHECK-NEXT: s_xor_b64 s[16:17], exec, s[16:17] -; CHECK-NEXT: s_cbranch_execz .LBB7_2 +; CHECK-NEXT: s_xor_b64 s[16:17], vcc, exec +; CHECK-NEXT: s_and_b64 s[18:19], vcc, -1 +; CHECK-NEXT: s_cmov_b64 exec, vcc +; CHECK-NEXT: s_cbranch_scc0 .LBB7_2 ; CHECK-NEXT: ; %bb.1: ; %ELSE ; CHECK-NEXT: image_sample v1, v0, s[0:7], s[8:11] dmask:0x1 ; CHECK-NEXT: s_and_saveexec_b64 s[18:19], s[14:15] @@ -236,9 +241,12 @@ define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inr ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen ; CHECK-NEXT: s_mov_b64 exec, s[18:19] +; CHECK-NEXT: s_or_b64 exec, exec, s[16:17] ; CHECK-NEXT: .LBB7_2: ; %Flow -; CHECK-NEXT: s_andn2_saveexec_b64 s[0:1], s[16:17] -; CHECK-NEXT: s_cbranch_execz .LBB7_4 +; CHECK-NEXT: s_xor_b64 s[0:1], s[16:17], exec +; CHECK-NEXT: s_and_b64 s[2:3], s[16:17], -1 +; CHECK-NEXT: s_cmov_b64 exec, s[16:17] +; CHECK-NEXT: s_cbranch_scc0 .LBB7_4 ; CHECK-NEXT: ; %bb.3: ; %IF ; CHECK-NEXT: v_mov_b32_e32 v0, s12 ; CHECK-NEXT: v_mov_b32_e32 v1, s13 @@ -247,8 +255,8 @@ define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inr ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_add_f32_e32 v2, v0, v1 ; CHECK-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec -; CHECK-NEXT: .LBB7_4: ; %END ; CHECK-NEXT: s_or_b64 exec, exec, s[0:1] +; CHECK-NEXT: .LBB7_4: ; %END ; CHECK-NEXT: s_and_b64 exec, exec, s[14:15] ; CHECK-NEXT: v_mov_b32_e32 v0, v2 ; CHECK-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll index 4c1ae4c228adb..95271cac16751 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll @@ -10,7 +10,6 @@ define amdgpu_gs void @main(<4 x i32> %arg, i32 %arg1) { ; GFX10-LABEL: main: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_mov_b32 s1, exec_lo ; GFX10-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-NEXT: v_readfirstlane_b32 s5, v1 @@ -21,13 +20,14 @@ define amdgpu_gs void @main(<4 x i32> %arg, i32 %arg1) { ; GFX10-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX10-NEXT: s_and_saveexec_b32 s0, s0 ; GFX10-NEXT: buffer_load_format_d16_xyz v[5:6], v4, s[4:7], 0 idxen +; GFX10-NEXT: s_xor_b32 s1, exec_lo, s0 ; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX10-NEXT: ; implicit-def: $vgpr4 +; GFX10-NEXT: s_and_b32 s2, s1, -1 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX10-NEXT: s_cbranch_execnz .LBB0_1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX10-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v5 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v6 @@ -37,7 +37,6 @@ define amdgpu_gs void @main(<4 x i32> %arg, i32 %arg1) { ; ; GFX9-LABEL: main: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 @@ -49,12 +48,13 @@ define amdgpu_gs void @main(<4 x i32> %arg, i32 %arg1) { ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_format_d16_xyz v[5:6], v4, s[4:7], 0 idxen +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr4 -; GFX9-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB0_1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX9-NEXT: ; %bb.2: -; GFX9-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v6 @@ -63,7 +63,6 @@ define amdgpu_gs void @main(<4 x i32> %arg, i32 %arg1) { ; ; GFX8-LABEL: main: ; GFX8: ; %bb.0: ; %bb -; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 @@ -75,12 +74,13 @@ define amdgpu_gs void @main(<4 x i32> %arg, i32 %arg1) { ; GFX8-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_load_format_d16_xyz v[5:6], v4, s[4:7], 0 idxen +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[0:1] +; GFX8-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX8-NEXT: ; implicit-def: $vgpr4 -; GFX8-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_cbranch_execnz .LBB0_1 +; GFX8-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX8-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX8-NEXT: ; %bb.2: -; GFX8-NEXT: s_mov_b64 exec, s[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v5 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v6 @@ -90,7 +90,6 @@ define amdgpu_gs void @main(<4 x i32> %arg, i32 %arg1) { ; ; GFX11-LABEL: main: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_mov_b32 s1, exec_lo ; GFX11-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 @@ -103,12 +102,14 @@ define amdgpu_gs void @main(<4 x i32> %arg, i32 %arg1) { ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_d16_format_xyz v[5:6], v4, s[4:7], 0 idxen +; GFX11-NEXT: s_xor_b32 s1, exec_lo, s0 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-NEXT: ; implicit-def: $vgpr4 -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB0_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s2, s1, -1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v5 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v1, 0xffff, v6 @@ -116,7 +117,6 @@ define amdgpu_gs void @main(<4 x i32> %arg, i32 %arg1) { ; ; GFX12-LABEL: main: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_mov_b32 s1, exec_lo ; GFX12-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 @@ -129,12 +129,14 @@ define amdgpu_gs void @main(<4 x i32> %arg, i32 %arg1) { ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: buffer_load_d16_format_xyz v[5:6], v4, s[4:7], null idxen +; GFX12-NEXT: s_xor_b32 s1, exec_lo, s0 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB0_1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_b32 s2, s1, -1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX12-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v5 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v1, 0xffff, v6 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.v3f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.v3f16.ll index 46b2516f72f8e..521d4d0a097ef 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.v3f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.v3f16.ll @@ -8,7 +8,6 @@ define amdgpu_gs void @main(ptr addrspace(8) %arg, i32 %arg1) { ; GFX10-LABEL: main: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_mov_b32 s1, exec_lo ; GFX10-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-NEXT: v_readfirstlane_b32 s5, v1 @@ -19,13 +18,14 @@ define amdgpu_gs void @main(ptr addrspace(8) %arg, i32 %arg1) { ; GFX10-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX10-NEXT: s_and_saveexec_b32 s0, s0 ; GFX10-NEXT: buffer_load_format_d16_xyz v[5:6], v4, s[4:7], 0 idxen +; GFX10-NEXT: s_xor_b32 s1, exec_lo, s0 ; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX10-NEXT: ; implicit-def: $vgpr4 +; GFX10-NEXT: s_and_b32 s2, s1, -1 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX10-NEXT: s_cbranch_execnz .LBB0_1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX10-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v5 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v6 @@ -35,7 +35,6 @@ define amdgpu_gs void @main(ptr addrspace(8) %arg, i32 %arg1) { ; ; GFX9-LABEL: main: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 @@ -47,12 +46,13 @@ define amdgpu_gs void @main(ptr addrspace(8) %arg, i32 %arg1) { ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_format_d16_xyz v[5:6], v4, s[4:7], 0 idxen +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[0:1] +; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr4 -; GFX9-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB0_1 +; GFX9-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX9-NEXT: ; %bb.2: -; GFX9-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v6 @@ -61,7 +61,6 @@ define amdgpu_gs void @main(ptr addrspace(8) %arg, i32 %arg1) { ; ; GFX8-LABEL: main: ; GFX8: ; %bb.0: ; %bb -; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 @@ -73,12 +72,13 @@ define amdgpu_gs void @main(ptr addrspace(8) %arg, i32 %arg1) { ; GFX8-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_load_format_d16_xyz v[5:6], v4, s[4:7], 0 idxen +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[0:1] +; GFX8-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX8-NEXT: ; implicit-def: $vgpr4 -; GFX8-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_cbranch_execnz .LBB0_1 +; GFX8-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX8-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX8-NEXT: ; %bb.2: -; GFX8-NEXT: s_mov_b64 exec, s[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v5 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v6 @@ -88,7 +88,6 @@ define amdgpu_gs void @main(ptr addrspace(8) %arg, i32 %arg1) { ; ; GFX11-LABEL: main: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_mov_b32 s1, exec_lo ; GFX11-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 @@ -101,12 +100,14 @@ define amdgpu_gs void @main(ptr addrspace(8) %arg, i32 %arg1) { ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_d16_format_xyz v[5:6], v4, s[4:7], 0 idxen +; GFX11-NEXT: s_xor_b32 s1, exec_lo, s0 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-NEXT: ; implicit-def: $vgpr4 -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB0_1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s2, s1, -1 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v5 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v1, 0xffff, v6 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll index 2e47cc505ee69..470b958907246 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll @@ -159,21 +159,22 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) { ; SI: ; %bb.0: ; %.entry ; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 ; SI-NEXT: v_cvt_i32_f32_e32 v1, v1 +; SI-NEXT: s_mov_b64 s[4:5], exec ; SI-NEXT: s_mov_b64 s[2:3], exec ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 1, v0 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0 -; SI-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] -; SI-NEXT: s_xor_b64 s[0:1], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB2_3 +; SI-NEXT: s_and_b64 s[6:7], s[0:1], -1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; SI-NEXT: s_cmov_b64 exec, s[0:1] +; SI-NEXT: s_cbranch_scc0 .LBB2_3 ; SI-NEXT: ; %bb.1: ; %.demote -; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], exec +; SI-NEXT: s_andn2_b64 s[4:5], s[4:5], exec ; SI-NEXT: s_cbranch_scc0 .LBB2_4 ; SI-NEXT: ; %bb.2: ; %.demote ; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: s_or_b64 exec, exec, s[2:3] ; SI-NEXT: .LBB2_3: ; %.continue -; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc ; SI-NEXT: exp mrt1 v0, v0, v0, v0 done vm ; SI-NEXT: s_endpgm @@ -186,21 +187,22 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) { ; GFX9: ; %bb.0: ; %.entry ; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB2_3 +; GFX9-NEXT: s_and_b64 s[6:7], s[0:1], -1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_cmov_b64 exec, s[0:1] +; GFX9-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX9-NEXT: ; %bb.1: ; %.demote -; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_andn2_b64 s[4:5], s[4:5], exec ; GFX9-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX9-NEXT: ; %bb.2: ; %.demote ; GFX9-NEXT: s_mov_b64 exec, 0 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: .LBB2_3: ; %.continue -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc ; GFX9-NEXT: exp mrt1 v0, v0, v0, v0 done vm ; GFX9-NEXT: s_endpgm @@ -213,21 +215,22 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) { ; GFX10-32: ; %bb.0: ; %.entry ; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX10-32-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX10-32-NEXT: s_mov_b32 s2, exec_lo ; GFX10-32-NEXT: s_mov_b32 s1, exec_lo ; GFX10-32-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-32-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-32-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 -; GFX10-32-NEXT: s_and_saveexec_b32 s2, s0 -; GFX10-32-NEXT: s_xor_b32 s0, exec_lo, s2 -; GFX10-32-NEXT: s_cbranch_execz .LBB2_3 +; GFX10-32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_and_b32 s3, s0, -1 +; GFX10-32-NEXT: s_cmov_b32 exec_lo, s0 +; GFX10-32-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX10-32-NEXT: ; %bb.1: ; %.demote -; GFX10-32-NEXT: s_andn2_b32 s1, s1, exec_lo +; GFX10-32-NEXT: s_andn2_b32 s2, s2, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX10-32-NEXT: ; %bb.2: ; %.demote ; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX10-32-NEXT: .LBB2_3: ; %.continue -; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo ; GFX10-32-NEXT: exp mrt1 v0, v0, v0, v0 done vm ; GFX10-32-NEXT: s_endpgm @@ -240,21 +243,22 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) { ; GFX10-64: ; %bb.0: ; %.entry ; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX10-64-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX10-64-NEXT: s_mov_b64 s[4:5], exec ; GFX10-64-NEXT: s_mov_b64 s[2:3], exec ; GFX10-64-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-64-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10-64-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0 -; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] -; GFX10-64-NEXT: s_xor_b64 s[0:1], exec, s[4:5] -; GFX10-64-NEXT: s_cbranch_execz .LBB2_3 +; GFX10-64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_and_b64 s[6:7], s[0:1], -1 +; GFX10-64-NEXT: s_cmov_b64 exec, s[0:1] +; GFX10-64-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote -; GFX10-64-NEXT: s_andn2_b64 s[2:3], s[2:3], exec +; GFX10-64-NEXT: s_andn2_b64 s[4:5], s[4:5], exec ; GFX10-64-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX10-64-NEXT: ; %bb.2: ; %.demote ; GFX10-64-NEXT: s_mov_b64 exec, 0 +; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10-64-NEXT: .LBB2_3: ; %.continue -; GFX10-64-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc ; GFX10-64-NEXT: exp mrt1 v0, v0, v0, v0 done vm ; GFX10-64-NEXT: s_endpgm @@ -287,17 +291,18 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre ; SI-NEXT: s_mov_b64 s[12:13], exec ; SI-NEXT: s_wqm_b64 exec, exec ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 -; SI-NEXT: s_and_saveexec_b64 s[14:15], vcc -; SI-NEXT: s_xor_b64 s[14:15], exec, s[14:15] -; SI-NEXT: s_cbranch_execz .LBB3_3 +; SI-NEXT: s_mov_b64 s[14:15], exec +; SI-NEXT: s_and_b64 s[16:17], vcc, -1 +; SI-NEXT: s_cmov_b64 exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB3_3 ; SI-NEXT: ; %bb.1: ; %.demote ; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; SI-NEXT: s_cbranch_scc0 .LBB3_4 ; SI-NEXT: ; %bb.2: ; %.demote ; SI-NEXT: s_wqm_b64 s[16:17], s[12:13] ; SI-NEXT: s_and_b64 exec, exec, s[16:17] -; SI-NEXT: .LBB3_3: ; %.continue ; SI-NEXT: s_or_b64 exec, exec, s[14:15] +; SI-NEXT: .LBB3_3: ; %.continue ; SI-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f32_e32 v0, v0, v0 @@ -316,17 +321,18 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 -; GFX9-NEXT: s_and_saveexec_b64 s[14:15], vcc -; GFX9-NEXT: s_xor_b64 s[14:15], exec, s[14:15] -; GFX9-NEXT: s_cbranch_execz .LBB3_3 +; GFX9-NEXT: s_mov_b64 s[14:15], exec +; GFX9-NEXT: s_and_b64 s[16:17], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB3_3 ; GFX9-NEXT: ; %bb.1: ; %.demote ; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; GFX9-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX9-NEXT: ; %bb.2: ; %.demote ; GFX9-NEXT: s_wqm_b64 s[16:17], s[12:13] ; GFX9-NEXT: s_and_b64 exec, exec, s[16:17] -; GFX9-NEXT: .LBB3_3: ; %.continue ; GFX9-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX9-NEXT: .LBB3_3: ; %.continue ; GFX9-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 @@ -345,17 +351,18 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-32-NEXT: s_mov_b32 s12, exec_lo ; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v1 -; GFX10-32-NEXT: s_and_saveexec_b32 s13, vcc_lo -; GFX10-32-NEXT: s_xor_b32 s13, exec_lo, s13 -; GFX10-32-NEXT: s_cbranch_execz .LBB3_3 +; GFX10-32-NEXT: s_mov_b32 s13, exec_lo +; GFX10-32-NEXT: s_and_b32 s14, vcc_lo, -1 +; GFX10-32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-32-NEXT: s_cbranch_scc0 .LBB3_3 ; GFX10-32-NEXT: ; %bb.1: ; %.demote ; GFX10-32-NEXT: s_andn2_b32 s12, s12, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX10-32-NEXT: ; %bb.2: ; %.demote ; GFX10-32-NEXT: s_wqm_b32 s14, s12 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s14 -; GFX10-32-NEXT: .LBB3_3: ; %.continue ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13 +; GFX10-32-NEXT: .LBB3_3: ; %.continue ; GFX10-32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX10-32-NEXT: s_waitcnt vmcnt(0) ; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 @@ -374,17 +381,18 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-64-NEXT: s_mov_b64 s[12:13], exec ; GFX10-64-NEXT: s_wqm_b64 exec, exec ; GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 -; GFX10-64-NEXT: s_and_saveexec_b64 s[14:15], vcc -; GFX10-64-NEXT: s_xor_b64 s[14:15], exec, s[14:15] -; GFX10-64-NEXT: s_cbranch_execz .LBB3_3 +; GFX10-64-NEXT: s_mov_b64 s[14:15], exec +; GFX10-64-NEXT: s_and_b64 s[16:17], vcc, -1 +; GFX10-64-NEXT: s_cmov_b64 exec, vcc +; GFX10-64-NEXT: s_cbranch_scc0 .LBB3_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote ; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; GFX10-64-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX10-64-NEXT: ; %bb.2: ; %.demote ; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17] -; GFX10-64-NEXT: .LBB3_3: ; %.continue ; GFX10-64-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX10-64-NEXT: .LBB3_3: ; %.continue ; GFX10-64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX10-64-NEXT: s_waitcnt vmcnt(0) ; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 @@ -421,19 +429,20 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre ; SI-NEXT: s_mov_b64 s[12:13], exec ; SI-NEXT: s_wqm_b64 exec, exec ; SI-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 +; SI-NEXT: s_mov_b64 s[14:15], exec ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 -; SI-NEXT: s_and_saveexec_b64 s[14:15], vcc -; SI-NEXT: s_xor_b64 s[14:15], exec, s[14:15] -; SI-NEXT: s_cbranch_execz .LBB4_3 +; SI-NEXT: s_and_b64 s[16:17], vcc, -1 +; SI-NEXT: s_cmov_b64 exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB4_3 ; SI-NEXT: ; %bb.1: ; %.demote ; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; SI-NEXT: s_cbranch_scc0 .LBB4_4 ; SI-NEXT: ; %bb.2: ; %.demote ; SI-NEXT: s_wqm_b64 s[16:17], s[12:13] ; SI-NEXT: s_and_b64 exec, exec, s[16:17] -; SI-NEXT: .LBB4_3: ; %.continue ; SI-NEXT: s_or_b64 exec, exec, s[14:15] +; SI-NEXT: .LBB4_3: ; %.continue ; SI-NEXT: v_add_f32_e32 v0, v0, v0 ; SI-NEXT: s_and_b64 exec, exec, s[12:13] ; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf @@ -450,19 +459,20 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 +; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[14:15], vcc -; GFX9-NEXT: s_xor_b64 s[14:15], exec, s[14:15] -; GFX9-NEXT: s_cbranch_execz .LBB4_3 +; GFX9-NEXT: s_and_b64 s[16:17], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX9-NEXT: ; %bb.1: ; %.demote ; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; GFX9-NEXT: s_cbranch_scc0 .LBB4_4 ; GFX9-NEXT: ; %bb.2: ; %.demote ; GFX9-NEXT: s_wqm_b64 s[16:17], s[12:13] ; GFX9-NEXT: s_and_b64 exec, exec, s[16:17] -; GFX9-NEXT: .LBB4_3: ; %.continue ; GFX9-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX9-NEXT: .LBB4_3: ; %.continue ; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf @@ -479,19 +489,20 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-32-NEXT: s_mov_b32 s12, exec_lo ; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: s_mov_b32 s13, exec_lo ; GFX10-32-NEXT: s_waitcnt vmcnt(0) ; GFX10-32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v0 -; GFX10-32-NEXT: s_and_saveexec_b32 s13, vcc_lo -; GFX10-32-NEXT: s_xor_b32 s13, exec_lo, s13 -; GFX10-32-NEXT: s_cbranch_execz .LBB4_3 +; GFX10-32-NEXT: s_and_b32 s14, vcc_lo, -1 +; GFX10-32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-32-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX10-32-NEXT: ; %bb.1: ; %.demote ; GFX10-32-NEXT: s_andn2_b32 s12, s12, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 .LBB4_4 ; GFX10-32-NEXT: ; %bb.2: ; %.demote ; GFX10-32-NEXT: s_wqm_b32 s14, s12 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s14 -; GFX10-32-NEXT: .LBB4_3: ; %.continue ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13 +; GFX10-32-NEXT: .LBB4_3: ; %.continue ; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D @@ -508,19 +519,20 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-64-NEXT: s_mov_b64 s[12:13], exec ; GFX10-64-NEXT: s_wqm_b64 exec, exec ; GFX10-64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_mov_b64 s[14:15], exec ; GFX10-64-NEXT: s_waitcnt vmcnt(0) ; GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 -; GFX10-64-NEXT: s_and_saveexec_b64 s[14:15], vcc -; GFX10-64-NEXT: s_xor_b64 s[14:15], exec, s[14:15] -; GFX10-64-NEXT: s_cbranch_execz .LBB4_3 +; GFX10-64-NEXT: s_and_b64 s[16:17], vcc, -1 +; GFX10-64-NEXT: s_cmov_b64 exec, vcc +; GFX10-64-NEXT: s_cbranch_scc0 .LBB4_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote ; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; GFX10-64-NEXT: s_cbranch_scc0 .LBB4_4 ; GFX10-64-NEXT: ; %bb.2: ; %.demote ; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17] -; GFX10-64-NEXT: .LBB4_3: ; %.continue ; GFX10-64-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX10-64-NEXT: .LBB4_3: ; %.continue ; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 ; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D @@ -665,39 +677,41 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { ; SI-NEXT: s_wqm_b64 exec, exec ; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc -; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; SI-NEXT: s_cbranch_execz .LBB6_3 +; SI-NEXT: s_xor_b64 s[2:3], vcc, exec +; SI-NEXT: s_and_b64 s[4:5], vcc, -1 +; SI-NEXT: s_cmov_b64 exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB6_3 ; SI-NEXT: ; %bb.1: ; %.demote0 ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; SI-NEXT: s_cbranch_scc0 .LBB6_7 ; SI-NEXT: ; %bb.2: ; %.demote0 ; SI-NEXT: s_wqm_b64 s[4:5], s[0:1] ; SI-NEXT: s_and_b64 exec, exec, s[4:5] -; SI-NEXT: .LBB6_3: ; %.continue0 ; SI-NEXT: s_or_b64 exec, exec, s[2:3] -; SI-NEXT: s_mov_b64 s[2:3], s[0:1] -; SI-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] +; SI-NEXT: .LBB6_3: ; %.continue0 +; SI-NEXT: s_mov_b64 s[4:5], s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[4:5] ; SI-NEXT: v_mov_b32_e32 v1, v0 -; SI-NEXT: s_xor_b64 s[2:3], s[0:1], -1 -; SI-NEXT: s_nop 0 +; SI-NEXT: s_mov_b64 s[2:3], exec +; SI-NEXT: s_xor_b64 s[4:5], s[0:1], -1 ; SI-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; SI-NEXT: s_nop 1 ; SI-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; SI-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; SI-NEXT: s_and_b64 exec, exec, s[0:1] ; SI-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 -; SI-NEXT: s_or_b64 s[2:3], s[2:3], vcc -; SI-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] -; SI-NEXT: s_xor_b64 s[2:3], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB6_6 +; SI-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; SI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; SI-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; SI-NEXT: s_cmov_b64 exec, s[4:5] +; SI-NEXT: s_cbranch_scc0 .LBB6_6 ; SI-NEXT: ; %bb.4: ; %.demote1 ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; SI-NEXT: s_cbranch_scc0 .LBB6_7 ; SI-NEXT: ; %bb.5: ; %.demote1 ; SI-NEXT: s_mov_b64 exec, 0 -; SI-NEXT: .LBB6_6: ; %.continue1 ; SI-NEXT: s_or_b64 exec, exec, s[2:3] +; SI-NEXT: .LBB6_6: ; %.continue1 ; SI-NEXT: v_bfrev_b32_e32 v0, 60 ; SI-NEXT: v_mov_b32_e32 v1, 0x3c00 ; SI-NEXT: exp mrt0 v1, v1, v0, v0 done compr vm @@ -713,39 +727,41 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { ; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX9-NEXT: s_cbranch_execz .LBB6_3 +; GFX9-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX9-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX9-NEXT: ; %bb.1: ; %.demote0 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cbranch_scc0 .LBB6_7 ; GFX9-NEXT: ; %bb.2: ; %.demote0 ; GFX9-NEXT: s_wqm_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_and_b64 exec, exec, s[4:5] -; GFX9-NEXT: .LBB6_3: ; %.continue0 ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_mov_b64 s[2:3], s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] +; GFX9-NEXT: .LBB6_3: ; %.continue0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_xor_b64 s[2:3], s[0:1], -1 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_xor_b64 s[4:5], s[0:1], -1 ; GFX9-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; GFX9-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 -; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], vcc -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB6_6 +; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX9-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9-NEXT: s_cbranch_scc0 .LBB6_6 ; GFX9-NEXT: ; %bb.4: ; %.demote1 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cbranch_scc0 .LBB6_7 ; GFX9-NEXT: ; %bb.5: ; %.demote1 ; GFX9-NEXT: s_mov_b64 exec, 0 -; GFX9-NEXT: .LBB6_6: ; %.continue1 ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB6_6: ; %.continue1 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 60 ; GFX9-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm @@ -761,37 +777,40 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { ; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX10-32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX10-32-NEXT: s_cbranch_execz .LBB6_3 +; GFX10-32-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX10-32-NEXT: s_and_b32 s2, vcc_lo, -1 +; GFX10-32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-32-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX10-32-NEXT: ; %bb.1: ; %.demote0 ; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 .LBB6_7 ; GFX10-32-NEXT: ; %bb.2: ; %.demote0 ; GFX10-32-NEXT: s_wqm_b32 s2, s0 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s2 -; GFX10-32-NEXT: .LBB6_3: ; %.continue0 ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10-32-NEXT: s_mov_b32 s1, s0 -; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s1 +; GFX10-32-NEXT: .LBB6_3: ; %.continue0 +; GFX10-32-NEXT: s_mov_b32 s2, s0 +; GFX10-32-NEXT: s_mov_b32 s1, exec_lo +; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s2 ; GFX10-32-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-32-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-32-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0 ; GFX10-32-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0, v0 -; GFX10-32-NEXT: s_xor_b32 s1, s0, -1 -; GFX10-32-NEXT: s_or_b32 s1, s1, vcc_lo -; GFX10-32-NEXT: s_and_saveexec_b32 s2, s1 -; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s2 -; GFX10-32-NEXT: s_cbranch_execz .LBB6_6 +; GFX10-32-NEXT: s_xor_b32 s2, s0, -1 +; GFX10-32-NEXT: s_or_b32 s2, s2, vcc_lo +; GFX10-32-NEXT: s_and_b32 s2, s2, exec_lo +; GFX10-32-NEXT: s_and_b32 s3, s2, -1 +; GFX10-32-NEXT: s_cmov_b32 exec_lo, s2 +; GFX10-32-NEXT: s_cbranch_scc0 .LBB6_6 ; GFX10-32-NEXT: ; %bb.4: ; %.demote1 ; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 .LBB6_7 ; GFX10-32-NEXT: ; %bb.5: ; %.demote1 ; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 -; GFX10-32-NEXT: .LBB6_6: ; %.continue1 ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-32-NEXT: .LBB6_6: ; %.continue1 ; GFX10-32-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX10-32-NEXT: v_bfrev_b32_e32 v1, 60 ; GFX10-32-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm @@ -807,37 +826,40 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { ; GFX10-64-NEXT: s_wqm_b64 exec, exec ; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX10-64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX10-64-NEXT: s_cbranch_execz .LBB6_3 +; GFX10-64-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX10-64-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX10-64-NEXT: s_cmov_b64 exec, vcc +; GFX10-64-NEXT: s_cbranch_scc0 .LBB6_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote0 ; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX10-64-NEXT: s_cbranch_scc0 .LBB6_7 ; GFX10-64-NEXT: ; %bb.2: ; %.demote0 ; GFX10-64-NEXT: s_wqm_b64 s[4:5], s[0:1] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[4:5] -; GFX10-64-NEXT: .LBB6_3: ; %.continue0 ; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX10-64-NEXT: s_mov_b64 s[2:3], s[0:1] -; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] +; GFX10-64-NEXT: .LBB6_3: ; %.continue0 +; GFX10-64-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX10-64-NEXT: s_mov_b64 s[2:3], exec +; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[4:5] ; GFX10-64-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-64-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-64-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX10-64-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 -; GFX10-64-NEXT: s_xor_b64 s[2:3], s[0:1], -1 -; GFX10-64-NEXT: s_or_b64 s[2:3], s[2:3], vcc -; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] -; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[4:5] -; GFX10-64-NEXT: s_cbranch_execz .LBB6_6 +; GFX10-64-NEXT: s_xor_b64 s[4:5], s[0:1], -1 +; GFX10-64-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GFX10-64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10-64-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX10-64-NEXT: s_cmov_b64 exec, s[4:5] +; GFX10-64-NEXT: s_cbranch_scc0 .LBB6_6 ; GFX10-64-NEXT: ; %bb.4: ; %.demote1 ; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX10-64-NEXT: s_cbranch_scc0 .LBB6_7 ; GFX10-64-NEXT: ; %bb.5: ; %.demote1 ; GFX10-64-NEXT: s_mov_b64 exec, 0 -; GFX10-64-NEXT: .LBB6_6: ; %.continue1 ; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10-64-NEXT: .LBB6_6: ; %.continue1 ; GFX10-64-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX10-64-NEXT: v_bfrev_b32_e32 v1, 60 ; GFX10-64-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm @@ -889,44 +911,47 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc -; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; SI-NEXT: s_cbranch_execz .LBB7_3 +; SI-NEXT: s_xor_b64 s[2:3], vcc, exec +; SI-NEXT: s_and_b64 s[4:5], vcc, -1 +; SI-NEXT: s_cmov_b64 exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB7_3 ; SI-NEXT: ; %bb.1: ; %.demote0 ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; SI-NEXT: s_cbranch_scc0 .LBB7_9 ; SI-NEXT: ; %bb.2: ; %.demote0 ; SI-NEXT: s_wqm_b64 s[4:5], s[0:1] ; SI-NEXT: s_and_b64 exec, exec, s[4:5] -; SI-NEXT: .LBB7_3: ; %.continue0.preheader ; SI-NEXT: s_or_b64 exec, exec, s[2:3] +; SI-NEXT: .LBB7_3: ; %.continue0.preheader ; SI-NEXT: s_mov_b64 s[2:3], 0 ; SI-NEXT: s_branch .LBB7_5 ; SI-NEXT: .LBB7_4: ; %.continue1 ; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_add_i32 s6, s6, 1 ; SI-NEXT: v_cmp_ge_i32_e32 vcc, s6, v1 ; SI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; SI-NEXT: s_andn2_b64 exec, exec, s[2:3] -; SI-NEXT: s_cbranch_execz .LBB7_8 +; SI-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; SI-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; SI-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; SI-NEXT: s_cbranch_scc0 .LBB7_8 ; SI-NEXT: .LBB7_5: ; %.continue0 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: s_mov_b64 s[4:5], s[0:1] -; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[4:5] +; SI-NEXT: s_mov_b64 s[8:9], s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[8:9] ; SI-NEXT: v_mov_b32_e32 v2, v0 -; SI-NEXT: s_xor_b64 s[4:5], s[0:1], -1 -; SI-NEXT: s_nop 0 +; SI-NEXT: s_xor_b64 s[8:9], s[0:1], -1 +; SI-NEXT: s_mov_b64 s[4:5], exec ; SI-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; SI-NEXT: s_nop 1 ; SI-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; SI-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; SI-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 -; SI-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; SI-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] -; SI-NEXT: s_xor_b64 s[4:5], exec, s[8:9] -; SI-NEXT: s_cbranch_execz .LBB7_4 +; SI-NEXT: s_or_b64 s[8:9], s[8:9], vcc +; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec +; SI-NEXT: s_and_b64 s[10:11], s[8:9], -1 +; SI-NEXT: s_cmov_b64 exec, s[8:9] +; SI-NEXT: s_cbranch_scc0 .LBB7_4 ; SI-NEXT: ; %bb.6: ; %.demote1 ; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec @@ -935,9 +960,9 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 ; SI-NEXT: s_wqm_b64 s[8:9], s[0:1] ; SI-NEXT: s_and_b64 exec, exec, s[8:9] +; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_branch .LBB7_4 ; SI-NEXT: .LBB7_8: ; %.return -; SI-NEXT: s_or_b64 exec, exec, s[2:3] ; SI-NEXT: s_and_b64 exec, exec, s[0:1] ; SI-NEXT: v_bfrev_b32_e32 v0, 60 ; SI-NEXT: v_mov_b32_e32 v1, 0x3c00 @@ -955,44 +980,47 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX9-NEXT: s_mov_b32 s6, 0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX9-NEXT: s_cbranch_execz .LBB7_3 +; GFX9-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX9-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX9-NEXT: ; %bb.1: ; %.demote0 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cbranch_scc0 .LBB7_9 ; GFX9-NEXT: ; %bb.2: ; %.demote0 ; GFX9-NEXT: s_wqm_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_and_b64 exec, exec, s[4:5] -; GFX9-NEXT: .LBB7_3: ; %.continue0.preheader ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: .LBB7_3: ; %.continue0.preheader ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_branch .LBB7_5 ; GFX9-NEXT: .LBB7_4: ; %.continue1 ; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_add_i32 s6, s6, 1 ; GFX9-NEXT: v_cmp_ge_i32_e32 vcc, s6, v1 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execz .LBB7_8 +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX9-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX9-NEXT: s_cbranch_scc0 .LBB7_8 ; GFX9-NEXT: .LBB7_5: ; %.continue0 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[4:5] +; GFX9-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[8:9] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_xor_b64 s[4:5], s[0:1], -1 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], -1 +; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 -; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[8:9] -; GFX9-NEXT: s_cbranch_execz .LBB7_4 +; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], vcc +; GFX9-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX9-NEXT: s_and_b64 s[10:11], s[8:9], -1 +; GFX9-NEXT: s_cmov_b64 exec, s[8:9] +; GFX9-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX9-NEXT: ; %bb.6: ; %.demote1 ; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec @@ -1001,9 +1029,9 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1 ; GFX9-NEXT: s_wqm_b64 s[8:9], s[0:1] ; GFX9-NEXT: s_and_b64 exec, exec, s[8:9] +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_branch .LBB7_4 ; GFX9-NEXT: .LBB7_8: ; %.return -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 60 @@ -1021,41 +1049,45 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX10-32-NEXT: s_mov_b32 s1, 0 ; GFX10-32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10-32-NEXT: s_xor_b32 s2, exec_lo, s2 -; GFX10-32-NEXT: s_cbranch_execz .LBB7_3 +; GFX10-32-NEXT: s_xor_b32 s2, vcc_lo, exec_lo +; GFX10-32-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX10-32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-32-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX10-32-NEXT: ; %bb.1: ; %.demote0 ; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 .LBB7_9 ; GFX10-32-NEXT: ; %bb.2: ; %.demote0 ; GFX10-32-NEXT: s_wqm_b32 s3, s0 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s3 -; GFX10-32-NEXT: .LBB7_3: ; %.continue0.preheader ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-32-NEXT: .LBB7_3: ; %.continue0.preheader ; GFX10-32-NEXT: s_mov_b32 s2, 0 ; GFX10-32-NEXT: s_branch .LBB7_5 ; GFX10-32-NEXT: .LBB7_4: ; %.continue1 ; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX10-32-NEXT: s_add_i32 s2, s2, 1 ; GFX10-32-NEXT: v_cmp_ge_i32_e32 vcc_lo, s2, v1 ; GFX10-32-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX10-32-NEXT: s_andn2_b32 exec_lo, exec_lo, s1 -; GFX10-32-NEXT: s_cbranch_execz .LBB7_8 +; GFX10-32-NEXT: s_andn2_b32 s3, exec_lo, s1 +; GFX10-32-NEXT: s_and_b32 s4, s3, -1 +; GFX10-32-NEXT: s_cselect_b32 exec_lo, s3, s1 +; GFX10-32-NEXT: s_cbranch_scc0 .LBB7_8 ; GFX10-32-NEXT: .LBB7_5: ; %.continue0 ; GFX10-32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-32-NEXT: s_mov_b32 s3, s0 -; GFX10-32-NEXT: v_cndmask_b32_e64 v0, s2, 0, s3 -; GFX10-32-NEXT: s_xor_b32 s3, s0, -1 +; GFX10-32-NEXT: s_mov_b32 s4, s0 +; GFX10-32-NEXT: s_mov_b32 s3, exec_lo +; GFX10-32-NEXT: v_cndmask_b32_e64 v0, s2, 0, s4 +; GFX10-32-NEXT: s_xor_b32 s4, s0, -1 ; GFX10-32-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-32-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-32-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; GFX10-32-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0, v0 -; GFX10-32-NEXT: s_or_b32 s3, s3, vcc_lo -; GFX10-32-NEXT: s_and_saveexec_b32 s4, s3 -; GFX10-32-NEXT: s_xor_b32 s3, exec_lo, s4 -; GFX10-32-NEXT: s_cbranch_execz .LBB7_4 +; GFX10-32-NEXT: s_or_b32 s4, s4, vcc_lo +; GFX10-32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10-32-NEXT: s_and_b32 s5, s4, -1 +; GFX10-32-NEXT: s_cmov_b32 exec_lo, s4 +; GFX10-32-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX10-32-NEXT: ; %bb.6: ; %.demote1 ; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 ; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo @@ -1064,9 +1096,9 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 ; GFX10-32-NEXT: s_wqm_b32 s4, s0 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s4 +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX10-32-NEXT: s_branch .LBB7_4 ; GFX10-32-NEXT: .LBB7_8: ; %.return -; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0 ; GFX10-32-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX10-32-NEXT: v_bfrev_b32_e32 v1, 60 @@ -1084,41 +1116,45 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX10-64-NEXT: s_mov_b32 s6, 0 ; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX10-64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX10-64-NEXT: s_cbranch_execz .LBB7_3 +; GFX10-64-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX10-64-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX10-64-NEXT: s_cmov_b64 exec, vcc +; GFX10-64-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote0 ; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX10-64-NEXT: s_cbranch_scc0 .LBB7_9 ; GFX10-64-NEXT: ; %bb.2: ; %.demote0 ; GFX10-64-NEXT: s_wqm_b64 s[4:5], s[0:1] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[4:5] -; GFX10-64-NEXT: .LBB7_3: ; %.continue0.preheader ; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10-64-NEXT: .LBB7_3: ; %.continue0.preheader ; GFX10-64-NEXT: s_mov_b64 s[2:3], 0 ; GFX10-64-NEXT: s_branch .LBB7_5 ; GFX10-64-NEXT: .LBB7_4: ; %.continue1 ; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX10-64-NEXT: s_add_i32 s6, s6, 1 ; GFX10-64-NEXT: v_cmp_ge_i32_e32 vcc, s6, v1 ; GFX10-64-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX10-64-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX10-64-NEXT: s_cbranch_execz .LBB7_8 +; GFX10-64-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX10-64-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GFX10-64-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX10-64-NEXT: s_cbranch_scc0 .LBB7_8 ; GFX10-64-NEXT: .LBB7_5: ; %.continue0 ; GFX10-64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-64-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX10-64-NEXT: v_cndmask_b32_e64 v0, s6, 0, s[4:5] -; GFX10-64-NEXT: s_xor_b64 s[4:5], s[0:1], -1 +; GFX10-64-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX10-64-NEXT: s_mov_b64 s[4:5], exec +; GFX10-64-NEXT: v_cndmask_b32_e64 v0, s6, 0, s[8:9] +; GFX10-64-NEXT: s_xor_b64 s[8:9], s[0:1], -1 ; GFX10-64-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-64-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-64-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; GFX10-64-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 -; GFX10-64-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX10-64-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] -; GFX10-64-NEXT: s_xor_b64 s[4:5], exec, s[8:9] -; GFX10-64-NEXT: s_cbranch_execz .LBB7_4 +; GFX10-64-NEXT: s_or_b64 s[8:9], s[8:9], vcc +; GFX10-64-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX10-64-NEXT: s_and_b64 s[10:11], s[8:9], -1 +; GFX10-64-NEXT: s_cmov_b64 exec, s[8:9] +; GFX10-64-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX10-64-NEXT: ; %bb.6: ; %.demote1 ; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1 ; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec @@ -1127,9 +1163,9 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1 ; GFX10-64-NEXT: s_wqm_b64 s[8:9], s[0:1] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[8:9] +; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX10-64-NEXT: s_branch .LBB7_4 ; GFX10-64-NEXT: .LBB7_8: ; %.return -; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX10-64-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX10-64-NEXT: v_bfrev_b32_e32 v1, 60 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll index 89abdb2b754a4..8affef90c2ac4 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll @@ -37,10 +37,11 @@ define float @lds_atomic_fadd_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB0_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -59,10 +60,11 @@ define float @lds_atomic_fadd_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB0_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(3) %ptr, float 4.0 seq_cst @@ -101,11 +103,12 @@ define void @lds_atomic_fadd_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB1_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: lds_atomic_fadd_noret_f32: @@ -122,11 +125,12 @@ define void @lds_atomic_fadd_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX8-NEXT: v_mov_b32_e32 v1, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB1_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(3) %ptr, float 4.0 seq_cst ret void @@ -136,33 +140,37 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) % ; VI-LABEL: lds_ds_fadd: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_mov_b64 s[4:5], exec -; VI-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; VI-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; VI-NEXT: s_mov_b64 s[6:7], exec +; VI-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; VI-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_i32 s3, s3, 4 +; VI-NEXT: s_mov_b64 s[4:5], exec +; VI-NEXT: s_and_b64 s[8:9], vcc, -1 ; VI-NEXT: ; implicit-def: $vgpr1 ; VI-NEXT: s_mov_b32 m0, -1 -; VI-NEXT: s_and_saveexec_b64 s[6:7], vcc -; VI-NEXT: s_cbranch_execz .LBB2_2 +; VI-NEXT: s_cmov_b64 exec, vcc +; VI-NEXT: s_cbranch_scc0 .LBB2_2 ; VI-NEXT: ; %bb.1: -; VI-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; VI-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; VI-NEXT: s_lshl_b32 s8, s3, 3 -; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, s4 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, s6 ; VI-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: ds_add_rtn_f32 v1, v2, v1 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: .LBB2_2: -; VI-NEXT: s_or_b64 exec, exec, s[6:7] ; VI-NEXT: s_mov_b64 s[6:7], exec +; VI-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0 +; VI-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v2 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; VI-NEXT: s_and_b64 s[8:9], vcc, -1 +; VI-NEXT: s_mov_b64 s[4:5], exec ; VI-NEXT: v_readfirstlane_b32 s8, v1 -; VI-NEXT: v_mbcnt_lo_u32_b32 v1, s6, 0 -; VI-NEXT: v_mbcnt_hi_u32_b32 v1, s7, v1 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_cbranch_execz .LBB2_4 +; VI-NEXT: s_cmov_b64 exec, vcc +; VI-NEXT: s_cbranch_scc0 .LBB2_4 ; VI-NEXT: ; %bb.3: ; VI-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, s6 @@ -171,8 +179,8 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) % ; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: ds_add_f32 v2, v1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: .LBB2_4: ; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: .LBB2_4: ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; VI-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; VI-NEXT: s_mov_b64 s[4:5], exec @@ -195,17 +203,18 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) % ; VI-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; VI-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; VI-NEXT: s_xor_b64 s[4:5], vcc, exec +; VI-NEXT: s_and_b64 s[6:7], vcc, -1 ; VI-NEXT: ; implicit-def: $vgpr2 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB2_8 +; VI-NEXT: s_cmov_b64 exec, vcc +; VI-NEXT: s_cbranch_scc0 .LBB2_8 ; VI-NEXT: ; %bb.7: ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: ds_add_rtn_f32 v2, v2, v1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: .LBB2_8: ; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: .LBB2_8: ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_readfirstlane_b32 s2, v2 ; VI-NEXT: v_add_f32_e32 v2, s2, v0 @@ -218,32 +227,36 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) % ; GFX9-LABEL: lds_ds_fadd: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-NEXT: s_mov_b64 s[6:7], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_i32 s3, s3, 4 +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX9-NEXT: s_cbranch_execz .LBB2_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX9-NEXT: s_lshl_b32 s8, s3, 3 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, s4 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, s6 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: ds_add_rtn_f32 v1, v2, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: .LBB2_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-NEXT: s_mov_b64 s[6:7], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_readfirstlane_b32 s8, v1 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, s6, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v1, s7, v1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB2_4 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, s6 @@ -252,8 +265,8 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) % ; GFX9-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-NEXT: ds_add_f32 v2, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB2_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB2_4: ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX9-NEXT: s_mov_b64 s[4:5], exec @@ -276,16 +289,17 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) % ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr2 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB2_8 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB2_8 ; GFX9-NEXT: ; %bb.7: ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB2_8: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB2_8: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s2, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -303,10 +317,12 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) % ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_i32 s3, s3, 4 +; GFX7-NEXT: s_mov_b64 s[4:5], exec +; GFX7-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX7-NEXT: ; implicit-def: $vgpr1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7-NEXT: s_cbranch_execz .LBB2_4 +; GFX7-NEXT: s_cmov_b64 exec, vcc +; GFX7-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX7-NEXT: ; %bb.1: ; GFX7-NEXT: s_lshl_b32 s8, s3, 3 ; GFX7-NEXT: v_mov_b32_e32 v2, s8 @@ -324,19 +340,22 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) % ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v4 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB2_2 +; GFX7-NEXT: s_andn2_b64 s[8:9], exec, s[6:7] +; GFX7-NEXT: s_and_b64 s[10:11], s[8:9], -1 +; GFX7-NEXT: s_cselect_b64 exec, s[8:9], s[6:7] +; GFX7-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX7-NEXT: ; %bb.3: ; %Flow15 -; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: .LBB2_4: ; %Flow16 ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: .LBB2_4: ; GFX7-NEXT: s_mov_b64 s[6:7], exec +; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v2, s6, 0 +; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX7-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX7-NEXT: s_mov_b64 s[4:5], exec ; GFX7-NEXT: v_readfirstlane_b32 s8, v1 -; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v1, s6, 0 -; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v1, s7, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7-NEXT: s_cbranch_execz .LBB2_7 +; GFX7-NEXT: s_cmov_b64 exec, vcc +; GFX7-NEXT: s_cbranch_scc0 .LBB2_8 ; GFX7-NEXT: ; %bb.5: ; GFX7-NEXT: s_lshl_b32 s3, s3, 4 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 @@ -353,18 +372,21 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) % ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[6:7] +; GFX7-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB2_6 -; GFX7-NEXT: .LBB2_7: ; %Flow14 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[6:7] +; GFX7-NEXT: s_cbranch_scc1 .LBB2_6 +; GFX7-NEXT: ; %bb.7: ; %Flow ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: .LBB2_8: ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: ds_read_b32 v1, v2 ; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX7-NEXT: v_add_f32_e32 v0, s8, v0 ; GFX7-NEXT: s_mov_b64 s[2:3], 0 -; GFX7-NEXT: .LBB2_8: ; %atomicrmw.start8 +; GFX7-NEXT: .LBB2_9: ; %atomicrmw.start8 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v3, v1 @@ -373,10 +395,11 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) % ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX7-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX7-NEXT: s_cbranch_execnz .LBB2_8 -; GFX7-NEXT: ; %bb.9: ; %atomicrmw.end7 -; GFX7-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX7-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX7-NEXT: s_cbranch_scc1 .LBB2_9 +; GFX7-NEXT: ; %bb.10: ; %atomicrmw.end7 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -393,10 +416,12 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) % ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_i32 s3, s3, 4 +; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB2_4 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB2_4 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_lshl_b32 s8, s3, 3 ; GFX8-NEXT: v_mov_b32_e32 v2, s8 @@ -414,19 +439,22 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) % ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB2_2 +; GFX8-NEXT: s_andn2_b64 s[8:9], exec, s[6:7] +; GFX8-NEXT: s_and_b64 s[10:11], s[8:9], -1 +; GFX8-NEXT: s_cselect_b64 exec, s[8:9], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX8-NEXT: ; %bb.3: ; %Flow17 -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: .LBB2_4: ; %Flow18 ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: .LBB2_4: ; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32_e64 v2, s6, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX8-NEXT: s_mov_b64 s[4:5], exec ; GFX8-NEXT: v_readfirstlane_b32 s8, v1 -; GFX8-NEXT: v_mbcnt_lo_u32_b32_e64 v1, s6, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32_e32 v1, s7, v1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB2_7 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB2_8 ; GFX8-NEXT: ; %bb.5: ; GFX8-NEXT: s_lshl_b32 s3, s3, 4 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -443,18 +471,21 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) % ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[10:11], exec, s[6:7] +; GFX8-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; GFX8-NEXT: v_mov_b32_e32 v3, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB2_6 -; GFX8-NEXT: .LBB2_7: ; %Flow16 +; GFX8-NEXT: s_cselect_b64 exec, s[10:11], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB2_6 +; GFX8-NEXT: ; %bb.7: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: .LBB2_8: ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: ds_read_b32 v1, v2 ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX8-NEXT: v_add_f32_e32 v0, s8, v0 ; GFX8-NEXT: s_mov_b64 s[2:3], 0 -; GFX8-NEXT: .LBB2_8: ; %atomicrmw.start8 +; GFX8-NEXT: .LBB2_9: ; %atomicrmw.start8 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v1 @@ -463,10 +494,11 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) % ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_cbranch_execnz .LBB2_8 -; GFX8-NEXT: ; %bb.9: ; %atomicrmw.end7 -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX8-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX8-NEXT: s_cbranch_scc1 .LBB2_9 +; GFX8-NEXT: ; %bb.10: ; %atomicrmw.end7 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 @@ -489,33 +521,37 @@ define amdgpu_kernel void @lds_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrspa ; VI-LABEL: lds_ds_fadd_one_as: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_mov_b64 s[4:5], exec -; VI-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; VI-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; VI-NEXT: s_mov_b64 s[6:7], exec +; VI-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; VI-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_i32 s3, s3, 4 +; VI-NEXT: s_mov_b64 s[4:5], exec +; VI-NEXT: s_and_b64 s[8:9], vcc, -1 ; VI-NEXT: ; implicit-def: $vgpr1 ; VI-NEXT: s_mov_b32 m0, -1 -; VI-NEXT: s_and_saveexec_b64 s[6:7], vcc -; VI-NEXT: s_cbranch_execz .LBB3_2 +; VI-NEXT: s_cmov_b64 exec, vcc +; VI-NEXT: s_cbranch_scc0 .LBB3_2 ; VI-NEXT: ; %bb.1: -; VI-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; VI-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; VI-NEXT: s_lshl_b32 s8, s3, 3 -; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, s4 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, s6 ; VI-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: ds_add_rtn_f32 v1, v2, v1 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: .LBB3_2: -; VI-NEXT: s_or_b64 exec, exec, s[6:7] ; VI-NEXT: s_mov_b64 s[6:7], exec +; VI-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0 +; VI-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v2 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; VI-NEXT: s_and_b64 s[8:9], vcc, -1 +; VI-NEXT: s_mov_b64 s[4:5], exec ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_readfirstlane_b32 s8, v1 -; VI-NEXT: v_mbcnt_lo_u32_b32 v1, s6, 0 -; VI-NEXT: v_mbcnt_hi_u32_b32 v1, s7, v1 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_cbranch_execz .LBB3_4 +; VI-NEXT: s_cmov_b64 exec, vcc +; VI-NEXT: s_cbranch_scc0 .LBB3_4 ; VI-NEXT: ; %bb.3: ; VI-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, s6 @@ -523,8 +559,8 @@ define amdgpu_kernel void @lds_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrspa ; VI-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: ds_add_f32 v2, v1 -; VI-NEXT: .LBB3_4: ; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: .LBB3_4: ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; VI-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; VI-NEXT: s_mov_b64 s[4:5], exec @@ -547,16 +583,17 @@ define amdgpu_kernel void @lds_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrspa ; VI-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; VI-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; VI-NEXT: s_xor_b64 s[4:5], vcc, exec +; VI-NEXT: s_and_b64 s[6:7], vcc, -1 ; VI-NEXT: ; implicit-def: $vgpr2 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB3_8 +; VI-NEXT: s_cmov_b64 exec, vcc +; VI-NEXT: s_cbranch_scc0 .LBB3_8 ; VI-NEXT: ; %bb.7: ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: ds_add_rtn_f32 v2, v2, v1 -; VI-NEXT: .LBB3_8: ; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: .LBB3_8: ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_readfirstlane_b32 s2, v2 @@ -569,32 +606,36 @@ define amdgpu_kernel void @lds_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrspa ; GFX9-LABEL: lds_ds_fadd_one_as: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-NEXT: s_mov_b64 s[6:7], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_i32 s3, s3, 4 +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX9-NEXT: s_cbranch_execz .LBB3_2 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX9-NEXT: s_lshl_b32 s8, s3, 3 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, s4 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, s6 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: ds_add_rtn_f32 v1, v2, v1 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: .LBB3_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-NEXT: s_mov_b64 s[6:7], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s8, v1 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, s6, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v1, s7, v1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB3_4 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, s6 @@ -602,8 +643,8 @@ define amdgpu_kernel void @lds_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrspa ; GFX9-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-NEXT: ds_add_f32 v2, v1 -; GFX9-NEXT: .LBB3_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB3_4: ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX9-NEXT: s_mov_b64 s[4:5], exec @@ -626,15 +667,16 @@ define amdgpu_kernel void @lds_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrspa ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_and_b64 s[6:7], vcc, -1 ; GFX9-NEXT: ; implicit-def: $vgpr2 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB3_8 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB3_8 ; GFX9-NEXT: ; %bb.7: ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: ds_add_rtn_f32 v2, v2, v1 -; GFX9-NEXT: .LBB3_8: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB3_8: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v2 @@ -652,10 +694,12 @@ define amdgpu_kernel void @lds_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrspa ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_i32 s3, s3, 4 +; GFX7-NEXT: s_mov_b64 s[4:5], exec +; GFX7-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX7-NEXT: ; implicit-def: $vgpr1 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7-NEXT: s_cbranch_execz .LBB3_4 +; GFX7-NEXT: s_cmov_b64 exec, vcc +; GFX7-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX7-NEXT: ; %bb.1: ; GFX7-NEXT: s_lshl_b32 s8, s3, 3 ; GFX7-NEXT: v_mov_b32_e32 v2, s8 @@ -673,19 +717,22 @@ define amdgpu_kernel void @lds_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrspa ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v4 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB3_2 +; GFX7-NEXT: s_andn2_b64 s[8:9], exec, s[6:7] +; GFX7-NEXT: s_and_b64 s[10:11], s[8:9], -1 +; GFX7-NEXT: s_cselect_b64 exec, s[8:9], s[6:7] +; GFX7-NEXT: s_cbranch_scc1 .LBB3_2 ; GFX7-NEXT: ; %bb.3: ; %Flow15 -; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: .LBB3_4: ; %Flow16 ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: .LBB3_4: ; GFX7-NEXT: s_mov_b64 s[6:7], exec +; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v2, s6, 0 +; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX7-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX7-NEXT: s_mov_b64 s[4:5], exec ; GFX7-NEXT: v_readfirstlane_b32 s8, v1 -; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v1, s6, 0 -; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v1, s7, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7-NEXT: s_cbranch_execz .LBB3_7 +; GFX7-NEXT: s_cmov_b64 exec, vcc +; GFX7-NEXT: s_cbranch_scc0 .LBB3_8 ; GFX7-NEXT: ; %bb.5: ; GFX7-NEXT: s_lshl_b32 s3, s3, 4 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 @@ -702,18 +749,21 @@ define amdgpu_kernel void @lds_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrspa ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 s[10:11], exec, s[6:7] +; GFX7-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB3_6 -; GFX7-NEXT: .LBB3_7: ; %Flow14 +; GFX7-NEXT: s_cselect_b64 exec, s[10:11], s[6:7] +; GFX7-NEXT: s_cbranch_scc1 .LBB3_6 +; GFX7-NEXT: ; %bb.7: ; %Flow ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: .LBB3_8: ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: ds_read_b32 v1, v2 ; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX7-NEXT: v_add_f32_e32 v0, s8, v0 ; GFX7-NEXT: s_mov_b64 s[2:3], 0 -; GFX7-NEXT: .LBB3_8: ; %atomicrmw.start8 +; GFX7-NEXT: .LBB3_9: ; %atomicrmw.start8 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v3, v1 @@ -722,10 +772,11 @@ define amdgpu_kernel void @lds_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrspa ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX7-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX7-NEXT: s_cbranch_execnz .LBB3_8 -; GFX7-NEXT: ; %bb.9: ; %atomicrmw.end7 -; GFX7-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX7-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX7-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX7-NEXT: s_cbranch_scc1 .LBB3_9 +; GFX7-NEXT: ; %bb.10: ; %atomicrmw.end7 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -742,10 +793,12 @@ define amdgpu_kernel void @lds_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrspa ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_i32 s3, s3, 4 +; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: s_and_b64 s[8:9], vcc, -1 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB3_4 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB3_4 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_lshl_b32 s8, s3, 3 ; GFX8-NEXT: v_mov_b32_e32 v2, s8 @@ -763,19 +816,22 @@ define amdgpu_kernel void @lds_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrspa ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB3_2 +; GFX8-NEXT: s_andn2_b64 s[8:9], exec, s[6:7] +; GFX8-NEXT: s_and_b64 s[10:11], s[8:9], -1 +; GFX8-NEXT: s_cselect_b64 exec, s[8:9], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB3_2 ; GFX8-NEXT: ; %bb.3: ; %Flow17 -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: .LBB3_4: ; %Flow18 ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: .LBB3_4: ; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32_e64 v2, s6, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX8-NEXT: s_mov_b64 s[4:5], exec ; GFX8-NEXT: v_readfirstlane_b32 s8, v1 -; GFX8-NEXT: v_mbcnt_lo_u32_b32_e64 v1, s6, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32_e32 v1, s7, v1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB3_7 +; GFX8-NEXT: s_cmov_b64 exec, vcc +; GFX8-NEXT: s_cbranch_scc0 .LBB3_8 ; GFX8-NEXT: ; %bb.5: ; GFX8-NEXT: s_lshl_b32 s3, s3, 4 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -792,18 +848,21 @@ define amdgpu_kernel void @lds_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrspa ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 s[10:11], exec, s[6:7] +; GFX8-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; GFX8-NEXT: v_mov_b32_e32 v3, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB3_6 -; GFX8-NEXT: .LBB3_7: ; %Flow16 +; GFX8-NEXT: s_cselect_b64 exec, s[10:11], s[6:7] +; GFX8-NEXT: s_cbranch_scc1 .LBB3_6 +; GFX8-NEXT: ; %bb.7: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: .LBB3_8: ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: ds_read_b32 v1, v2 ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX8-NEXT: v_add_f32_e32 v0, s8, v0 ; GFX8-NEXT: s_mov_b64 s[2:3], 0 -; GFX8-NEXT: .LBB3_8: ; %atomicrmw.start8 +; GFX8-NEXT: .LBB3_9: ; %atomicrmw.start8 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v1 @@ -812,10 +871,11 @@ define amdgpu_kernel void @lds_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrspa ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_cbranch_execnz .LBB3_8 -; GFX8-NEXT: ; %bb.9: ; %atomicrmw.end7 -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX8-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX8-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX8-NEXT: s_cbranch_scc1 .LBB3_9 +; GFX8-NEXT: ; %bb.10: ; %atomicrmw.end7 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 @@ -852,10 +912,11 @@ define double @lds_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB4_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB4_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: lds_atomic_fadd_ret_f64: @@ -874,10 +935,11 @@ define double @lds_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB4_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: lds_atomic_fadd_ret_f64: @@ -897,10 +959,11 @@ define double @lds_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB4_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: lds_atomic_fadd_ret_f64: @@ -920,10 +983,11 @@ define double @lds_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB4_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst ret double %result @@ -945,11 +1009,12 @@ define void @lds_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] ; VI-NEXT: v_mov_b32_e32 v1, v3 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; VI-NEXT: v_mov_b32_e32 v2, v4 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB5_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB5_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: lds_atomic_fadd_noret_f64: @@ -966,11 +1031,12 @@ define void @lds_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] ; GFX9-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-NEXT: v_mov_b32_e32 v2, v4 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB5_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: lds_atomic_fadd_noret_f64: @@ -988,11 +1054,12 @@ define void @lds_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX7-NEXT: v_mov_b32_e32 v2, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB5_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: lds_atomic_fadd_noret_f64: @@ -1010,11 +1077,12 @@ define void @lds_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] ; GFX8-NEXT: v_mov_b32_e32 v1, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX8-NEXT: v_mov_b32_e32 v2, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB5_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst ret void @@ -1036,10 +1104,11 @@ define float @lds_atomic_fsub_ret_f32(ptr addrspace(3) %ptr, float %val) nounwin ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB6_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB6_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1057,10 +1126,11 @@ define float @lds_atomic_fsub_ret_f32(ptr addrspace(3) %ptr, float %val) nounwin ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB6_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1079,10 +1149,11 @@ define float @lds_atomic_fsub_ret_f32(ptr addrspace(3) %ptr, float %val) nounwin ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB6_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -1101,10 +1172,11 @@ define float @lds_atomic_fsub_ret_f32(ptr addrspace(3) %ptr, float %val) nounwin ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB6_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fsub ptr addrspace(3) %ptr, float %val seq_cst @@ -1126,11 +1198,12 @@ define void @lds_atomic_fsub_noret_f32(ptr addrspace(3) %ptr, float %val) nounwi ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; VI-NEXT: v_mov_b32_e32 v2, v3 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB7_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB7_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: lds_atomic_fsub_noret_f32: @@ -1146,11 +1219,12 @@ define void @lds_atomic_fsub_noret_f32(ptr addrspace(3) %ptr, float %val) nounwi ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB7_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: lds_atomic_fsub_noret_f32: @@ -1167,11 +1241,12 @@ define void @lds_atomic_fsub_noret_f32(ptr addrspace(3) %ptr, float %val) nounwi ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX7-NEXT: v_mov_b32_e32 v2, v3 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB7_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: lds_atomic_fsub_noret_f32: @@ -1188,11 +1263,12 @@ define void @lds_atomic_fsub_noret_f32(ptr addrspace(3) %ptr, float %val) nounwi ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX8-NEXT: v_mov_b32_e32 v2, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB7_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fsub ptr addrspace(3) %ptr, float %val seq_cst ret void @@ -1215,10 +1291,11 @@ define double @lds_atomic_fsub_ret_f64(ptr addrspace(3) %ptr, double %val) nounw ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB8_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB8_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v3 ; VI-NEXT: v_mov_b32_e32 v1, v4 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -1238,10 +1315,11 @@ define double @lds_atomic_fsub_ret_f64(ptr addrspace(3) %ptr, double %val) nounw ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB8_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1262,10 +1340,11 @@ define double @lds_atomic_fsub_ret_f64(ptr addrspace(3) %ptr, double %val) nounw ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -1286,10 +1365,11 @@ define double @lds_atomic_fsub_ret_f64(ptr addrspace(3) %ptr, double %val) nounw ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB8_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -1313,11 +1393,12 @@ define void @lds_atomic_fsub_noret_f64(ptr addrspace(3) %ptr, double %val) nounw ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4] ; VI-NEXT: v_mov_b32_e32 v3, v5 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; VI-NEXT: v_mov_b32_e32 v4, v6 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB9_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB9_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: lds_atomic_fsub_noret_f64: @@ -1334,11 +1415,12 @@ define void @lds_atomic_fsub_noret_f64(ptr addrspace(3) %ptr, double %val) nounw ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4] ; GFX9-NEXT: v_mov_b32_e32 v3, v5 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-NEXT: v_mov_b32_e32 v4, v6 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB9_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: lds_atomic_fsub_noret_f64: @@ -1356,11 +1438,12 @@ define void @lds_atomic_fsub_noret_f64(ptr addrspace(3) %ptr, double %val) nounw ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4] ; GFX7-NEXT: v_mov_b32_e32 v3, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX7-NEXT: v_mov_b32_e32 v4, v6 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB9_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: lds_atomic_fsub_noret_f64: @@ -1378,11 +1461,12 @@ define void @lds_atomic_fsub_noret_f64(ptr addrspace(3) %ptr, double %val) nounw ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4] ; GFX8-NEXT: v_mov_b32_e32 v3, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX8-NEXT: v_mov_b32_e32 v4, v6 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB9_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fsub ptr addrspace(3) %ptr, double %val seq_cst ret void @@ -1420,10 +1504,11 @@ define bfloat @lds_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB10_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB10_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1456,10 +1541,11 @@ define bfloat @lds_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB10_1 +; GFX9-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[10:11], s[8:9], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1489,10 +1575,11 @@ define bfloat @lds_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB10_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -1523,10 +1610,11 @@ define bfloat @lds_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB10_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -1565,11 +1653,12 @@ define void @lds_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; VI-NEXT: v_mov_b32_e32 v3, v4 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB11_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB11_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: lds_atomic_fadd_noret_bf16: @@ -1600,11 +1689,12 @@ define void @lds_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[10:11], s[8:9], -1 ; GFX9-NEXT: v_mov_b32_e32 v3, v4 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB11_1 +; GFX9-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: lds_atomic_fadd_noret_bf16: @@ -1632,11 +1722,12 @@ define void @lds_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB11_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: lds_atomic_fadd_noret_bf16: @@ -1664,11 +1755,12 @@ define void @lds_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX8-NEXT: v_mov_b32_e32 v3, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB11_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(3) %ptr, bfloat 4.0 seq_cst ret void @@ -1707,10 +1799,11 @@ define float @lds_atomic_fadd_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspace ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB12_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -1729,10 +1822,11 @@ define float @lds_atomic_fadd_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspace ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB12_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(3) %ptr, float 4.0 seq_cst, !amdgpu.ignore.denormal.mode !0 @@ -1771,11 +1865,12 @@ define void @lds_atomic_fadd_noret_f32__amdgpu_ignore_denormal_mode(ptr addrspac ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB13_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: lds_atomic_fadd_noret_f32__amdgpu_ignore_denormal_mode: @@ -1792,11 +1887,12 @@ define void @lds_atomic_fadd_noret_f32__amdgpu_ignore_denormal_mode(ptr addrspac ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX8-NEXT: v_mov_b32_e32 v1, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB13_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(3) %ptr, float 4.0 seq_cst, !amdgpu.ignore.denormal.mode !0 ret void @@ -1820,10 +1916,11 @@ define <2 x half> @lds_atomic_fadd_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> % ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB14_1 +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB14_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1841,10 +1938,11 @@ define <2 x half> @lds_atomic_fadd_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> % ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB14_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1869,24 +1967,25 @@ define <2 x half> @lds_atomic_fadd_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> % ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB14_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -1911,24 +2010,25 @@ define <2 x half> @lds_atomic_fadd_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> % ; GFX8-NEXT: v_cvt_f32_f16_e32 v5, v1 ; GFX8-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_or_b32_e32 v7, v2, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 ; GFX8-NEXT: v_add_f32_e32 v6, v6, v4 ; GFX8-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX8-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; GFX8-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX8-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB14_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(3) %ptr, <2 x half> %val seq_cst @@ -1952,11 +2052,12 @@ define void @lds_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %val) ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; VI-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; VI-NEXT: v_mov_b32_e32 v2, v3 -; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB15_1 +; VI-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; VI-NEXT: s_cbranch_scc1 .LBB15_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: lds_atomic_fadd_noret_v2f16: @@ -1972,11 +2073,12 @@ define void @lds_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %val) ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB15_1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: lds_atomic_fadd_noret_v2f16: @@ -2000,24 +2102,25 @@ define void @lds_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %val) ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v4 +; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB15_1 +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: lds_atomic_fadd_noret_v2f16: @@ -2041,24 +2144,25 @@ define void @lds_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %val) ; GFX8-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX8-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX8-NEXT: v_add_f32_e32 v5, v5, v1 ; GFX8-NEXT: v_add_f32_e32 v6, v6, v2 ; GFX8-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX8-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX8-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX8-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX8-NEXT: v_or_b32_e32 v4, v6, v4 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v5 -; GFX8-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX8-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB15_1 +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(3) %ptr, <2 x half> %val seq_cst ret void @@ -2099,10 +2203,11 @@ define <2 x bfloat> @lds_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bflo ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] -; VI-NEXT: s_cbranch_execnz .LBB16_1 +; VI-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; VI-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; VI-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; VI-NEXT: s_cbranch_scc1 .LBB16_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[6:7] ; VI-NEXT: v_mov_b32_e32 v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2138,10 +2243,11 @@ define <2 x bfloat> @lds_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bflo ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX9-NEXT: s_cbranch_execnz .LBB16_1 +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX9-NEXT: s_and_b64 s[10:11], s[4:5], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX9-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2174,13 +2280,14 @@ define <2 x bfloat> @lds_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bflo ; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX7-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB16_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -2213,13 +2320,14 @@ define <2 x bfloat> @lds_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bflo ; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX8-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB16_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(3) %ptr, <2 x bfloat> %val seq_cst @@ -2260,11 +2368,12 @@ define void @lds_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> %v ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; VI-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; VI-NEXT: s_and_b64 s[8:9], s[4:5], -1 ; VI-NEXT: v_mov_b32_e32 v3, v4 -; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] -; VI-NEXT: s_cbranch_execnz .LBB17_1 +; VI-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; VI-NEXT: s_cbranch_scc1 .LBB17_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[6:7] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: lds_atomic_fadd_noret_v2bf16: @@ -2298,11 +2407,12 @@ define void @lds_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> %v ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX9-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX9-NEXT: s_and_b64 s[10:11], s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v3, v4 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX9-NEXT: s_cbranch_execnz .LBB17_1 +; GFX9-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX9-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: lds_atomic_fadd_noret_v2bf16: @@ -2333,13 +2443,14 @@ define void @lds_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> %v ; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX7-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB17_1 +; GFX7-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: lds_atomic_fadd_noret_v2bf16: @@ -2370,13 +2481,14 @@ define void @lds_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> %v ; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX8-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB17_1 +; GFX8-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(3) %ptr, <2 x bfloat> %val seq_cst ret void diff --git a/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll b/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll index cc90d03e66715..ef4ad07a0ac45 100644 --- a/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll +++ b/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll @@ -150,6 +150,7 @@ bb3: define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 { ; GCN-LABEL: min_long_forward_vbranch: ; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_mov_b64 s[4:5], exec ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 @@ -159,17 +160,18 @@ define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 { ; GCN-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GCN-NEXT: s_cbranch_execnz .LBB3_1 +; GCN-NEXT: s_and_b64 s[6:7], vcc, -1 +; GCN-NEXT: v_add_i32_e64 v0, s[0:1], s0, v0 +; GCN-NEXT: v_addc_u32_e64 v1, s[0:1], 0, v1, s[0:1] +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc1 .LBB3_1 ; GCN-NEXT: ; %bb.3: ; %bb -; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_getpc_b64 s[8:9] ; GCN-NEXT: .Lpost_getpc2: -; GCN-NEXT: s_add_u32 s4, s4, (.LBB3_2-.Lpost_getpc2)&4294967295 -; GCN-NEXT: s_addc_u32 s5, s5, (.LBB3_2-.Lpost_getpc2)>>32 -; GCN-NEXT: s_setpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s8, s8, (.LBB3_2-.Lpost_getpc2)&4294967295 +; GCN-NEXT: s_addc_u32 s9, s9, (.LBB3_2-.Lpost_getpc2)>>32 +; GCN-NEXT: s_setpc_b64 s[8:9] ; GCN-NEXT: .LBB3_1: ; %bb2 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; 32 bytes @@ -178,8 +180,8 @@ define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 { ; GCN-NEXT: v_nop_e64 ; GCN-NEXT: v_nop_e64 ; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: .LBB3_2: ; %bb3 -; GCN-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-NEXT: s_mov_b32 s0, s2 ; GCN-NEXT: s_mov_b32 s1, s2 ; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 diff --git a/llvm/test/CodeGen/AMDGPU/loop-live-out-copy-undef-subrange.ll b/llvm/test/CodeGen/AMDGPU/loop-live-out-copy-undef-subrange.ll index 2d3c03bbe5317..b9b6e6851a755 100644 --- a/llvm/test/CodeGen/AMDGPU/loop-live-out-copy-undef-subrange.ll +++ b/llvm/test/CodeGen/AMDGPU/loop-live-out-copy-undef-subrange.ll @@ -17,11 +17,12 @@ define <3 x float> @liveout_undef_subrange(<3 x float> %arg) { ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_cmp_neq_f32_e32 vcc, 0, v2 ; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_cbranch_execnz .LBB0_1 +; CHECK-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; CHECK-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; CHECK-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; CHECK-NEXT: s_cbranch_scc1 .LBB0_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: v_mul_f32_e32 v2, v3, v2 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: s_cbranch_execnz .LBB0_1 diff --git a/llvm/test/CodeGen/AMDGPU/loop-on-function-argument.ll b/llvm/test/CodeGen/AMDGPU/loop-on-function-argument.ll index 546022b4f9c43..4314ab133401b 100644 --- a/llvm/test/CodeGen/AMDGPU/loop-on-function-argument.ll +++ b/llvm/test/CodeGen/AMDGPU/loop-on-function-argument.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: opt -S -mtriple=amdgcn-- -structurizecfg -si-annotate-control-flow %s | FileCheck -check-prefix=IR %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s @@ -27,12 +28,13 @@ define void @loop_on_argument(i1 %arg) { ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_and_b64 s[6:7], exec, vcc ; CHECK-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] +; CHECK-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; CHECK-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; CHECK-NEXT: global_store_dword v[0:1], v0, off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_cbranch_execnz .LBB0_1 +; CHECK-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; CHECK-NEXT: s_cbranch_scc1 .LBB0_1 ; CHECK-NEXT: ; %bb.2: ; %exit -; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: br label %loop diff --git a/llvm/test/CodeGen/AMDGPU/loop_break.ll b/llvm/test/CodeGen/AMDGPU/loop_break.ll index 634390ba33caf..eebb275c9c032 100644 --- a/llvm/test/CodeGen/AMDGPU/loop_break.ll +++ b/llvm/test/CodeGen/AMDGPU/loop_break.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: opt -mtriple=amdgcn-- -S -structurizecfg -si-annotate-control-flow %s | FileCheck -check-prefix=OPT %s ; RUN: llc -mtriple=amdgcn -verify-machineinstrs -disable-block-placement < %s | FileCheck -check-prefix=GCN %s @@ -61,8 +62,10 @@ define amdgpu_kernel void @break_loop(i32 %arg) #0 { ; GCN-NEXT: ; in Loop: Header=BB0_1 Depth=1 ; GCN-NEXT: s_and_b64 s[8:9], exec, s[4:5] ; GCN-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] -; GCN-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN-NEXT: s_cbranch_execnz .LBB0_1 +; GCN-NEXT: s_andn2_b64 s[8:9], exec, s[0:1] +; GCN-NEXT: s_and_b64 s[10:11], s[8:9], -1 +; GCN-NEXT: s_cselect_b64 exec, s[8:9], s[0:1] +; GCN-NEXT: s_cbranch_scc1 .LBB0_1 ; GCN-NEXT: ; %bb.5: ; %bb9 ; GCN-NEXT: s_endpgm bb: @@ -140,10 +143,11 @@ define amdgpu_kernel void @undef_phi_cond_break_loop(i32 %arg) #0 { ; GCN-NEXT: s_add_i32 s6, s6, 1 ; GCN-NEXT: s_and_b64 s[8:9], exec, s[4:5] ; GCN-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] -; GCN-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN-NEXT: s_cbranch_execnz .LBB1_1 +; GCN-NEXT: s_andn2_b64 s[8:9], exec, s[0:1] +; GCN-NEXT: s_and_b64 s[10:11], s[8:9], -1 +; GCN-NEXT: s_cselect_b64 exec, s[8:9], s[0:1] +; GCN-NEXT: s_cbranch_scc1 .LBB1_1 ; GCN-NEXT: ; %bb.4: ; %bb9 -; GCN-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v0, 7 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: ds_write_b32 v0, v0 @@ -232,10 +236,11 @@ define amdgpu_kernel void @constexpr_phi_cond_break_loop(i32 %arg) #0 { ; GCN-NEXT: s_add_i32 s6, s6, 1 ; GCN-NEXT: s_and_b64 s[8:9], exec, s[4:5] ; GCN-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] -; GCN-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN-NEXT: s_cbranch_execnz .LBB2_1 +; GCN-NEXT: s_andn2_b64 s[8:9], exec, s[0:1] +; GCN-NEXT: s_and_b64 s[10:11], s[8:9], -1 +; GCN-NEXT: s_cselect_b64 exec, s[8:9], s[0:1] +; GCN-NEXT: s_cbranch_scc1 .LBB2_1 ; GCN-NEXT: ; %bb.4: ; %bb9 -; GCN-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v0, 7 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: ds_write_b32 v0, v0 @@ -321,10 +326,11 @@ define amdgpu_kernel void @true_phi_cond_break_loop(i32 %arg) #0 { ; GCN-NEXT: s_add_i32 s6, s6, 1 ; GCN-NEXT: s_and_b64 s[8:9], exec, s[4:5] ; GCN-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] -; GCN-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN-NEXT: s_cbranch_execnz .LBB3_1 +; GCN-NEXT: s_andn2_b64 s[8:9], exec, s[0:1] +; GCN-NEXT: s_and_b64 s[10:11], s[8:9], -1 +; GCN-NEXT: s_cselect_b64 exec, s[8:9], s[0:1] +; GCN-NEXT: s_cbranch_scc1 .LBB3_1 ; GCN-NEXT: ; %bb.4: ; %bb9 -; GCN-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v0, 7 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: ds_write_b32 v0, v0 @@ -410,10 +416,11 @@ define amdgpu_kernel void @false_phi_cond_break_loop(i32 %arg) #0 { ; GCN-NEXT: s_add_i32 s6, s6, 1 ; GCN-NEXT: s_and_b64 s[8:9], exec, s[4:5] ; GCN-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] -; GCN-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN-NEXT: s_cbranch_execnz .LBB4_1 +; GCN-NEXT: s_andn2_b64 s[8:9], exec, s[0:1] +; GCN-NEXT: s_and_b64 s[10:11], s[8:9], -1 +; GCN-NEXT: s_cselect_b64 exec, s[8:9], s[0:1] +; GCN-NEXT: s_cbranch_scc1 .LBB4_1 ; GCN-NEXT: ; %bb.4: ; %bb9 -; GCN-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v0, 7 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: ds_write_b32 v0, v0 @@ -504,10 +511,11 @@ define amdgpu_kernel void @invert_true_phi_cond_break_loop(i32 %arg) #0 { ; GCN-NEXT: s_add_i32 s6, s6, 1 ; GCN-NEXT: s_and_b64 s[8:9], exec, s[8:9] ; GCN-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] -; GCN-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN-NEXT: s_cbranch_execnz .LBB5_1 +; GCN-NEXT: s_andn2_b64 s[8:9], exec, s[0:1] +; GCN-NEXT: s_and_b64 s[10:11], s[8:9], -1 +; GCN-NEXT: s_cselect_b64 exec, s[8:9], s[0:1] +; GCN-NEXT: s_cbranch_scc1 .LBB5_1 ; GCN-NEXT: ; %bb.4: ; %bb9 -; GCN-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v0, 7 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: ds_write_b32 v0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll b/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll index a407cd20bf762..4cf7fc3bc6149 100644 --- a/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll +++ b/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll @@ -15,25 +15,27 @@ define void @needs_and(i32 %arg) { ; GCN-NEXT: s_branch .LBB0_2 ; GCN-NEXT: .LBB0_1: ; %endif ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GCN-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-NEXT: s_and_b64 s[4:5], exec, vcc ; GCN-NEXT: s_or_b64 s[6:7], s[4:5], s[6:7] ; GCN-NEXT: s_add_i32 s10, s10, 1 -; GCN-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GCN-NEXT: s_cbranch_execz .LBB0_4 +; GCN-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GCN-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GCN-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GCN-NEXT: s_cbranch_scc0 .LBB0_4 ; GCN-NEXT: .LBB0_2: ; %loop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_cmp_gt_u32_e64 s[4:5], s10, v0 +; GCN-NEXT: s_mov_b64 s[8:9], exec +; GCN-NEXT: s_and_b64 s[12:13], s[4:5], -1 ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s10, v0 -; GCN-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_1 +; GCN-NEXT: s_cmov_b64 exec, s[4:5] +; GCN-NEXT: s_cbranch_scc0 .LBB0_1 ; GCN-NEXT: ; %bb.3: ; %then ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GCN-NEXT: s_nop 1 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], s4 +; GCN-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-NEXT: s_branch .LBB0_1 ; GCN-NEXT: .LBB0_4: ; %loopexit -; GCN-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] entry: @@ -71,11 +73,12 @@ define void @doesnt_need_and(i32 %arg) { ; GCN-NEXT: s_add_i32 s6, s6, 1 ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 ; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN-NEXT: s_andn2_b64 s[8:9], exec, s[4:5] +; GCN-NEXT: s_and_b64 s[10:11], s[8:9], -1 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], s4 -; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB1_1 +; GCN-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GCN-NEXT: s_cbranch_scc1 .LBB1_1 ; GCN-NEXT: ; %bb.2: ; %loopexit -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] entry: @@ -107,23 +110,26 @@ define void @break_cond_is_arg(i32 %arg, i1 %breakcond) { ; GCN-NEXT: s_branch .LBB2_2 ; GCN-NEXT: .LBB2_1: ; %endif ; GCN-NEXT: ; in Loop: Header=BB2_2 Depth=1 -; GCN-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-NEXT: s_and_b64 s[8:9], exec, s[4:5] ; GCN-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] ; GCN-NEXT: s_add_i32 s10, s10, 1 -; GCN-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GCN-NEXT: s_cbranch_execz .LBB2_4 +; GCN-NEXT: s_andn2_b64 s[8:9], exec, s[6:7] +; GCN-NEXT: s_and_b64 s[12:13], s[8:9], -1 +; GCN-NEXT: s_cselect_b64 exec, s[8:9], s[6:7] +; GCN-NEXT: s_cbranch_scc0 .LBB2_4 ; GCN-NEXT: .LBB2_2: ; %loop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, s10, v0 -; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GCN-NEXT: s_cbranch_execz .LBB2_1 +; GCN-NEXT: s_mov_b64 s[8:9], exec +; GCN-NEXT: s_and_b64 s[12:13], vcc, -1 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB2_1 ; GCN-NEXT: ; %bb.3: ; %then ; GCN-NEXT: ; in Loop: Header=BB2_2 Depth=1 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], s4 +; GCN-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-NEXT: s_branch .LBB2_1 ; GCN-NEXT: .LBB2_4: ; %loopexit -; GCN-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-intervals.mir b/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-intervals.mir index 9eeec4fa3a93d..cbbb933a298d0 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-intervals.mir +++ b/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-intervals.mir @@ -15,32 +15,33 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[COPY]], implicit $exec - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $exec_lo, implicit-def $exec_lo - ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[V_CMP_NE_U32_e64_]], implicit-def dead $scc - ; CHECK-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[S_AND_B32_]], [[COPY1]], implicit-def dead $scc - ; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_AND_B32_]] - ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec - ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_NE_U32_e64_]], $exec_lo, implicit-def $scc + ; CHECK-NEXT: dead [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_CMP_NE_U32_e64_]], 4294967295, implicit-def $scc + ; CHECK-NEXT: $exec_lo = S_CMOV_B32_term [[V_CMP_NE_U32_e64_]], implicit $scc + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; CHECK-NEXT: S_BRANCH %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: $exec_lo = S_OR_B32 $exec_lo, %3, implicit-def $scc ; CHECK-NEXT: S_ENDPGM 0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $exec_lo = S_OR_B32_term $exec_lo, [[S_XOR_B32_]], implicit-def $scc + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.1(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[S_OR_SAVEEXEC_B32_:%[0-9]+]]:sreg_32 = S_OR_SAVEEXEC_B32 [[S_XOR_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 $exec_lo, [[S_OR_SAVEEXEC_B32_]], implicit-def $scc - ; CHECK-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_B32_1]], implicit-def $scc - ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec - ; CHECK-NEXT: S_BRANCH %bb.4 + ; CHECK-NEXT: [[S_XOR_B32_1:%[0-9]+]]:sreg_32 = S_XOR_B32 [[S_XOR_B32_]], $exec_lo, implicit-def $scc + ; CHECK-NEXT: dead [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_XOR_B32_]], 4294967295, implicit-def $scc + ; CHECK-NEXT: $exec_lo = S_CMOV_B32_term [[S_XOR_B32_]], implicit $scc + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.4, implicit $scc + ; CHECK-NEXT: S_BRANCH %bb.1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $exec_lo = S_OR_B32_term $exec_lo, [[S_XOR_B32_1]], implicit-def $scc ; CHECK-NEXT: S_BRANCH %bb.1 bb.0: successors: %bb.2(0x40000000), %bb.3(0x40000000) @@ -52,11 +53,11 @@ body: | S_BRANCH %bb.2 bb.1: - SI_END_CF killed %1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_ENDPGM 0 bb.2: successors: %bb.3(0x80000000) + SI_WAVE_RECONVERGE %0, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.3: @@ -68,6 +69,7 @@ body: | bb.4: successors: %bb.1(0x80000000) + SI_WAVE_RECONVERGE killed %1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.1 ... @@ -94,12 +96,13 @@ body: | ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 $exec_lo, [[V_CMP_GT_I32_e64_]], implicit-def $scc ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_B32_]], [[COPY2]], implicit-def $scc ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_OR_B32_]] - ; CHECK-NEXT: $exec_lo = S_ANDN2_B32_term $exec_lo, [[S_OR_B32_]], implicit-def $scc - ; CHECK-NEXT: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; CHECK-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32 = S_ANDN2_B32 $exec_lo, [[S_OR_B32_]], implicit-def $scc + ; CHECK-NEXT: dead [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_ANDN2_B32_]], 4294967295, implicit-def $scc + ; CHECK-NEXT: $exec_lo = S_CSELECT_B32_term [[S_ANDN2_B32_]], [[S_OR_B32_]], implicit $scc + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_OR_B32_]], implicit-def $scc ; CHECK-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1(0x80000000) @@ -120,7 +123,6 @@ body: | S_BRANCH %bb.2 bb.2: - SI_END_CF killed %2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_ENDPGM 0 ... @@ -137,27 +139,33 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[V_CMP_NGT_F32_e64_:%[0-9]+]]:sreg_32 = nofpexcept V_CMP_NGT_F32_e64 0, 0, 0, [[COPY]], 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $exec_lo, implicit-def $exec_lo - ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[V_CMP_NGT_F32_e64_]], implicit-def dead $scc - ; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_AND_B32_]] - ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec - ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $exec_lo + ; CHECK-NEXT: dead [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_CMP_NGT_F32_e64_]], 4294967295, implicit-def $scc + ; CHECK-NEXT: $exec_lo = S_CMOV_B32_term [[V_CMP_NGT_F32_e64_]], implicit $scc + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc + ; CHECK-NEXT: S_BRANCH %bb.4 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[V_CMP_NLT_F32_e64_:%[0-9]+]]:sreg_32 = nofpexcept V_CMP_NLT_F32_e64 0, 0, 0, [[COPY]], 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $exec_lo, implicit-def $exec_lo - ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY2]], [[V_CMP_NLT_F32_e64_]], implicit-def dead $scc - ; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_AND_B32_1]] - ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec - ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $exec_lo + ; CHECK-NEXT: dead [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_CMP_NLT_F32_e64_]], 4294967295, implicit-def $scc + ; CHECK-NEXT: $exec_lo = S_CMOV_B32_term [[V_CMP_NLT_F32_e64_]], implicit $scc + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; CHECK-NEXT: S_BRANCH %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $exec_lo = S_OR_B32_term $exec_lo, [[COPY2]], implicit-def $scc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x80000000) ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $exec_lo = S_OR_B32_term $exec_lo, [[COPY1]], implicit-def $scc + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: - ; CHECK-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[COPY1]], implicit-def $scc ; CHECK-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1(0x40000000), %bb.4(0x40000000) @@ -177,15 +185,15 @@ body: | bb.2: successors: %bb.3(0x80000000) + SI_WAVE_RECONVERGE killed %1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.3: successors: %bb.4(0x80000000) + SI_WAVE_RECONVERGE killed %0, implicit-def dead $exec, implicit-def dead $scc, implicit $exec - SI_END_CF killed %1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.4: - SI_END_CF killed %0, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_ENDPGM 0 ... @@ -203,11 +211,11 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; CHECK-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[COPY]], implicit $exec - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $exec_lo, implicit-def $exec_lo - ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[V_CMP_NE_U32_e64_]], implicit-def dead $scc - ; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_AND_B32_]] - ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.5, implicit $exec - ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $exec_lo + ; CHECK-NEXT: dead [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_CMP_NE_U32_e64_]], 4294967295, implicit-def $scc + ; CHECK-NEXT: $exec_lo = S_CMOV_B32_term [[V_CMP_NE_U32_e64_]], implicit $scc + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc + ; CHECK-NEXT: S_BRANCH %bb.5 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.6(0x80000000) @@ -219,11 +227,17 @@ body: | ; CHECK-NEXT: S_BRANCH %bb.6 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.5(0x80000000) + ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE %9, %subreg.sub0, %9, %subreg.sub1, %9, %subreg.sub2, %9, %subreg.sub3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY %11 ; CHECK-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY6]], [[REG_SEQUENCE]], 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; CHECK-NEXT: $exec_lo = S_OR_B32_term $exec_lo, %12, implicit-def $scc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.5(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $exec_lo = S_OR_B32_term $exec_lo, [[COPY1]], implicit-def $scc ; CHECK-NEXT: S_BRANCH %bb.5 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: @@ -232,7 +246,6 @@ body: | ; CHECK-NEXT: bb.5: ; CHECK-NEXT: successors: %bb.4(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[COPY1]], implicit-def $scc ; CHECK-NEXT: S_BRANCH %bb.4 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.6: @@ -252,17 +265,16 @@ body: | ; CHECK-NEXT: S_BRANCH %bb.7 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.7: - ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000) + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; CHECK-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY2]], 0, implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 0, [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $exec_lo, implicit-def $exec_lo - ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY9]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc - ; CHECK-NEXT: dead [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[S_AND_B32_1]], [[COPY9]], implicit-def dead $scc - ; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_AND_B32_1]] - ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.5, implicit $exec - ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_EQ_U32_e64_]], $exec_lo, implicit-def $scc + ; CHECK-NEXT: dead [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_CMP_EQ_U32_e64_]], 4294967295, implicit-def $scc + ; CHECK-NEXT: $exec_lo = S_CMOV_B32_term [[V_CMP_EQ_U32_e64_]], implicit $scc + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; CHECK-NEXT: S_BRANCH %bb.3 bb.0: successors: %bb.1(0x40000000), %bb.5(0x40000000) liveins: $vgpr0 @@ -288,11 +300,12 @@ body: | %23:sgpr_128 = REG_SEQUENCE killed %19, %subreg.sub0, %19, %subreg.sub1, %19, %subreg.sub2, %19, %subreg.sub3 %24:vgpr_32 = COPY killed %4 BUFFER_ATOMIC_ADD_OFFSET killed %24, killed %23, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + SI_WAVE_RECONVERGE killed %7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.3: successors: %bb.5(0x80000000) + SI_WAVE_RECONVERGE killed %0, implicit-def dead $exec, implicit-def dead $scc, implicit $exec - SI_END_CF killed %7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.5 bb.4: @@ -301,7 +314,6 @@ body: | bb.5: successors: %bb.4(0x80000000) - SI_END_CF killed %0, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.4 bb.6: diff --git a/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-variables-update.mir b/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-variables-update.mir index 02e3d7e81fd40..7668965fe14c4 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-variables-update.mir +++ b/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-variables-update.mir @@ -5,7 +5,7 @@ # name used for a copy, so some of the check variable names were # manually fixed. -# Check for LiveVariables verifier error after lowering SI_END_CF +# Check for LiveVariables verifier error after lowering SI_WAVE_RECONVERGE --- name: live_variables_update_block_split @@ -21,42 +21,39 @@ body: | ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 0, killed [[COPY]], implicit $exec ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B32_e32_]] ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed [[V_MOV_B32_e32_]] - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY3]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc - ; CHECK-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_]], [[COPY3]], implicit-def dead $scc - ; CHECK-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_]] - ; CHECK-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term killed [[S_XOR_B64_]], implicit $exec - ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec - ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY $exec + ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_]], -1, implicit-def $scc + ; CHECK-NEXT: $exec = S_CMOV_B64_term [[V_CMP_EQ_U32_e64_]], implicit $scc + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; CHECK-NEXT: S_BRANCH %bb.1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: successors: %bb.3(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_64_xexec = COPY killed [[S_MOV_B64_term]] - ; CHECK-NEXT: $exec = S_OR_B64_term $exec, killed [[COPY4]], implicit-def $scc - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.2(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[COPY1]] - ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = nsw V_ADD_U32_e32 1, killed [[COPY5]], implicit $exec + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY killed [[COPY1]] + ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = nsw V_ADD_U32_e32 1, killed [[COPY4]], implicit $exec ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed [[V_ADD_U32_e32_]] ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]] - ; CHECK-NEXT: GLOBAL_STORE_DWORD undef %10:vreg_64, [[COPY6]], 0, 0, implicit $exec :: (volatile store (s32), addrspace 1) - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]] - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed [[COPY7]] - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY8]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc - ; CHECK-NEXT: [[S_XOR_B64_1:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_1]], [[COPY8]], implicit-def dead $scc - ; CHECK-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_1]] - ; CHECK-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term killed [[S_XOR_B64_1]], implicit $exec - ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec - ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]] + ; CHECK-NEXT: GLOBAL_STORE_DWORD undef %8:vreg_64, [[COPY5]], 0, 0, implicit $exec :: (volatile store (s32), addrspace 1) + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[COPY5]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_]], $exec, implicit-def $scc + ; CHECK-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_1]], $exec, implicit-def $scc + ; CHECK-NEXT: [[S_AND_B64_2:%[0-9]+]]:sreg_64 = S_AND_B64 [[S_AND_B64_1]], -1, implicit-def $scc + ; CHECK-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_1]], implicit $scc + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; CHECK-NEXT: S_BRANCH %bb.3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]] + ; CHECK-NEXT: $exec = S_OR_B64_term $exec, [[COPY3]], implicit-def $scc + ; CHECK-NEXT: S_BRANCH %bb.1 bb.0: successors: %bb.2(0x40000000), %bb.1(0x40000000) liveins: $vgpr0 @@ -70,20 +67,22 @@ body: | bb.1: successors: %bb.2(0x80000000) - %4:sreg_64_xexec = PHI %5, %bb.2, %3, %bb.0 - %6:vgpr_32 = PHI %7, %bb.2, %1, %bb.0 - SI_END_CF killed %4, implicit-def $exec, implicit-def dead $scc, implicit $exec + %6:vgpr_32 = PHI %7, %bb.21, %1, %bb.0 %8:vgpr_32 = nsw V_ADD_U32_e32 1, killed %6, implicit $exec bb.2: - successors: %bb.2(0x40000000), %bb.1(0x40000000) + successors: %bb.2(0x40000000), %bb.21(0x40000000) %9:vgpr_32 = PHI %8, %bb.1, %7, %bb.2, %1, %bb.0 GLOBAL_STORE_DWORD undef %10:vreg_64, %9, 0, 0, implicit $exec :: (volatile store (s32), addrspace 1) %7:vgpr_32 = COPY killed %9 - %5:sreg_64_xexec = SI_IF %2, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec + %5:sreg_64_xexec = SI_IF %2, %bb.21, implicit-def $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.2 + bb.21: + SI_WAVE_RECONVERGE %3, implicit-def $exec, implicit-def dead $scc, implicit $exec + S_BRANCH %bb.1 + ... # Here %4 which is not a phi join reg has its last use in bb.2. When @@ -102,48 +101,44 @@ body: | ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 0, killed [[COPY]], implicit $exec ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B32_e32_]] ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed [[V_MOV_B32_e32_]] - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY3]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc - ; CHECK-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_]], [[COPY3]], implicit-def dead $scc - ; CHECK-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_]] - ; CHECK-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term killed [[S_XOR_B64_]], implicit $exec - ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec - ; CHECK-NEXT: S_BRANCH %bb.3 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY $exec + ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_]], -1, implicit-def $scc + ; CHECK-NEXT: $exec = S_CMOV_B64_term [[V_CMP_EQ_U32_e64_]], implicit $scc + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.3, implicit $scc + ; CHECK-NEXT: S_BRANCH %bb.1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY killed [[COPY1]] - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_64_xexec = COPY killed [[S_MOV_B64_term]] ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.4(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_64_xexec = COPY killed [[COPY5]] - ; CHECK-NEXT: $exec = S_OR_B64_term $exec, killed [[COPY6]], implicit-def $scc - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = nsw V_ADD_U32_e32 1, killed [[COPY4]], implicit $exec ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed [[V_ADD_U32_e32_]] ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]] - ; CHECK-NEXT: GLOBAL_STORE_DWORD undef %11:vreg_64, [[COPY7]], 0, 0, implicit $exec :: (volatile store (s32), addrspace 1) - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY7]] - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed [[COPY8]] - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY9]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc - ; CHECK-NEXT: [[S_XOR_B64_1:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_1]], [[COPY9]], implicit-def dead $scc - ; CHECK-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_1]] - ; CHECK-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term killed [[S_XOR_B64_1]], implicit $exec - ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec - ; CHECK-NEXT: S_BRANCH %bb.3 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]] + ; CHECK-NEXT: GLOBAL_STORE_DWORD undef %8:vreg_64, [[COPY5]], 0, 0, implicit $exec :: (volatile store (s32), addrspace 1) + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[COPY5]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_]], $exec, implicit-def $scc + ; CHECK-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_1]], $exec, implicit-def $scc + ; CHECK-NEXT: [[S_AND_B64_2:%[0-9]+]]:sreg_64 = S_AND_B64 [[S_AND_B64_1]], -1, implicit-def $scc + ; CHECK-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_1]], implicit $scc + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.3, implicit $scc + ; CHECK-NEXT: S_BRANCH %bb.4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]] + ; CHECK-NEXT: $exec = S_OR_B64_term $exec, [[COPY3]], implicit-def $scc + ; CHECK-NEXT: S_BRANCH %bb.1 bb.0: successors: %bb.3(0x40000000), %bb.1(0x40000000) liveins: $vgpr0 @@ -157,26 +152,27 @@ body: | bb.1: successors: %bb.2(0x80000000) - %4:sreg_64_xexec = PHI %5, %bb.3, %3, %bb.0 - %6:vgpr_32 = PHI %7, %bb.3, %1, %bb.0 + %6:vgpr_32 = PHI %7, %bb.31, %1, %bb.0 S_BRANCH %bb.2 bb.2: successors: %bb.3(0x80000000) - %8:sreg_64_xexec = COPY %4 - SI_END_CF killed %8, implicit-def $exec, implicit-def dead $scc, implicit $exec %9:vgpr_32 = nsw V_ADD_U32_e32 1, killed %6, implicit $exec bb.3: - successors: %bb.3(0x40000000), %bb.1(0x40000000) + successors: %bb.3(0x40000000), %bb.31(0x40000000) %10:vgpr_32 = PHI %9, %bb.2, %7, %bb.3, %1, %bb.0 GLOBAL_STORE_DWORD undef %11:vreg_64, %10, 0, 0, implicit $exec :: (volatile store (s32), addrspace 1) %7:vgpr_32 = COPY killed %10 - %5:sreg_64_xexec = SI_IF %2, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec + %5:sreg_64_xexec = SI_IF %2, %bb.31, implicit-def $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.3 + bb.31: + SI_WAVE_RECONVERGE %3, implicit-def $exec, implicit-def dead $scc, implicit $exec + S_BRANCH %bb.1 + ... # Check we don't get "Block should not be in AliveBlocks" for @@ -195,44 +191,41 @@ body: | ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 0, killed [[COPY]], implicit $exec ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B32_e32_]] ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed [[V_MOV_B32_e32_]] - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY3]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc - ; CHECK-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_]], [[COPY3]], implicit-def dead $scc - ; CHECK-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_]] - ; CHECK-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term killed [[S_XOR_B64_]], implicit $exec - ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec - ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY $exec + ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_]], -1, implicit-def $scc + ; CHECK-NEXT: $exec = S_CMOV_B64_term [[V_CMP_EQ_U32_e64_]], implicit $scc + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; CHECK-NEXT: S_BRANCH %bb.1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: successors: %bb.3(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_64_xexec = COPY killed [[S_MOV_B64_term]] - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1 - ; CHECK-NEXT: $exec = S_OR_B64_term $exec, killed [[COPY4]], implicit-def $scc - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.2(0x80000000) ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1 ; CHECK-NEXT: S_NOP 0, implicit killed [[S_MOV_B64_]] - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[COPY1]] - ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = nsw V_ADD_U32_e32 1, killed [[COPY5]], implicit $exec + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY killed [[COPY1]] + ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = nsw V_ADD_U32_e32 1, killed [[COPY4]], implicit $exec ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed [[V_ADD_U32_e32_]] ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]] - ; CHECK-NEXT: GLOBAL_STORE_DWORD undef %11:vreg_64, [[COPY6]], 0, 0, implicit $exec :: (volatile store (s32), addrspace 1) - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]] - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed [[COPY7]] - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY8]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc - ; CHECK-NEXT: [[S_XOR_B64_1:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_1]], [[COPY8]], implicit-def dead $scc - ; CHECK-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_1]] - ; CHECK-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term killed [[S_XOR_B64_1]], implicit $exec - ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec - ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]] + ; CHECK-NEXT: GLOBAL_STORE_DWORD undef %9:vreg_64, [[COPY5]], 0, 0, implicit $exec :: (volatile store (s32), addrspace 1) + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[COPY5]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_]], $exec, implicit-def $scc + ; CHECK-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_1]], $exec, implicit-def $scc + ; CHECK-NEXT: [[S_AND_B64_2:%[0-9]+]]:sreg_64 = S_AND_B64 [[S_AND_B64_1]], -1, implicit-def $scc + ; CHECK-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_1]], implicit $scc + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; CHECK-NEXT: S_BRANCH %bb.3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]] + ; CHECK-NEXT: $exec = S_OR_B64_term $exec, [[COPY3]], implicit-def $scc + ; CHECK-NEXT: S_BRANCH %bb.1 bb.0: liveins: $vgpr0 @@ -243,20 +236,22 @@ body: | S_BRANCH %bb.2 bb.1: - %4:sreg_64_xexec = PHI %5, %bb.2, %3, %bb.0 - %6:vgpr_32 = PHI %7, %bb.2, %1, %bb.0 + %6:vgpr_32 = PHI %7, %bb.21, %1, %bb.0 %8:sreg_64 = S_MOV_B64 1 - SI_END_CF killed %4, implicit-def $exec, implicit-def dead $scc, implicit $exec S_NOP 0, implicit killed %8 %9:vgpr_32 = nsw V_ADD_U32_e32 1, killed %6, implicit $exec bb.2: - successors: %bb.2(0x40000000), %bb.1(0x40000000) + successors: %bb.2(0x40000000), %bb.21(0x40000000) %10:vgpr_32 = PHI %9, %bb.1, %7, %bb.2, %1, %bb.0 GLOBAL_STORE_DWORD undef %11:vreg_64, %10, 0, 0, implicit $exec :: (volatile store (s32), addrspace 1) %7:vgpr_32 = COPY killed %10 - %5:sreg_64_xexec = SI_IF %2, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec + %5:sreg_64_xexec = SI_IF %2, %bb.21, implicit-def $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.2 + bb.21: + SI_WAVE_RECONVERGE %3, implicit-def $exec, implicit-def dead $scc, implicit $exec + S_BRANCH %bb.1 + ... diff --git a/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-variables-update.xfail.mir b/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-variables-update.xfail.mir index f4e26aeae6766..c31d1fe71955e 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-variables-update.xfail.mir +++ b/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-variables-update.xfail.mir @@ -1,3 +1,4 @@ +; XFAIL: * # RUN: not --crash llc -mtriple=amdgcn-amd-amdhsa -start-before=livevars -stop-after=twoaddressinstruction -verify-machineinstrs -o - %s 2>&1 | FileCheck %s # CHECK: *** Bad machine code: LiveVariables: Block missing from AliveBlocks *** @@ -26,7 +27,7 @@ body: | %4:sreg_64_xexec = PHI %5, %bb.3, %3, %bb.0 %6:vgpr_32 = PHI %7, %bb.3, %1, %bb.0 %8:sreg_64 = S_MOV_B64 1 - SI_END_CF killed %4, implicit-def $exec, implicit-def dead $scc, implicit $exec + SI_WAVE_RECONVERGE killed %4, implicit-def $exec, implicit-def dead $scc, implicit $exec %9:vgpr_32 = nsw V_ADD_U32_e32 1, killed %6, implicit $exec bb.2: diff --git a/llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir b/llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir index 914cc8ae8844c..ecf76a55eda2c 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir +++ b/llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir @@ -21,13 +21,12 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 0, [[COPY]], implicit $exec - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY2]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc - ; CHECK-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_]], [[COPY2]], implicit-def dead $scc - ; CHECK-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_]] + ; CHECK-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[V_CMP_EQ_U32_e64_]], $exec, implicit-def $scc + ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_]], -1, implicit-def $scc + ; CHECK-NEXT: $exec = S_CMOV_B64_term [[V_CMP_EQ_U32_e64_]], implicit $scc ; CHECK-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term killed [[COPY1]], implicit $exec - ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec - ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; CHECK-NEXT: S_BRANCH %bb.1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -68,12 +67,12 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 0, [[COPY]], implicit $exec - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY2]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc - ; CHECK-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_]], [[COPY2]], implicit-def dead $scc - ; CHECK-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_]] + ; CHECK-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[V_CMP_EQ_U32_e64_]], $exec, implicit-def $scc + ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_]], -1, implicit-def $scc + ; CHECK-NEXT: $exec = S_CMOV_B64_term [[V_CMP_EQ_U32_e64_]], implicit $scc ; CHECK-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term killed [[COPY1]], implicit $exec - ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc + ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -109,15 +108,15 @@ body: | ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $sgpr4_sgpr5 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[S_OR_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_OR_SAVEEXEC_B64 %2, implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 0, [[COPY]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 $exec, [[S_OR_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_B64_]], implicit-def $scc + ; CHECK-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[V_CMP_EQ_U32_e64_]], $exec, implicit-def $scc + ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_]], -1, implicit-def $scc + ; CHECK-NEXT: $exec = S_CMOV_B64_term [[V_CMP_EQ_U32_e64_]], implicit $scc ; CHECK-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term killed [[COPY1]], implicit $exec - ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec - ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; CHECK-NEXT: S_BRANCH %bb.1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -157,9 +156,11 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 0, [[COPY]], implicit $exec - ; CHECK-NEXT: $exec = S_ANDN2_B64_term $exec, [[V_CMP_EQ_U32_e64_]], implicit-def $scc + ; CHECK-NEXT: [[S_ANDN2_B64_:%[0-9]+]]:sreg_64 = S_ANDN2_B64 $exec, [[V_CMP_EQ_U32_e64_]], implicit-def $scc + ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[S_ANDN2_B64_]], -1, implicit-def $scc + ; CHECK-NEXT: $exec = S_CSELECT_B64_term [[S_ANDN2_B64_]], [[V_CMP_EQ_U32_e64_]], implicit $scc ; CHECK-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term killed [[COPY1]], implicit $exec - ; CHECK-NEXT: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: @@ -209,40 +210,34 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed $vgpr1 ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 killed [[COPY]], killed [[COPY1]], implicit $exec - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY2]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc - ; CHECK-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_]], [[COPY2]], implicit-def dead $scc - ; CHECK-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_]] + ; CHECK-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[V_CMP_EQ_U32_e64_]], $exec, implicit-def $scc + ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_]], -1, implicit-def $scc + ; CHECK-NEXT: $exec = S_CMOV_B64_term [[V_CMP_EQ_U32_e64_]], implicit $scc ; CHECK-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_]], implicit $exec ; CHECK-NEXT: [[S_MOV_B64_term1:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_]], implicit $exec - ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec - ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; CHECK-NEXT: S_BRANCH %bb.1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[S_MOV_B64_term1]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY [[S_MOV_B64_term1]] ; CHECK-NEXT: dead [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD undef %8:vreg_64, 0, 0, implicit $exec :: (volatile load (s32), addrspace 1) - ; CHECK-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = COPY [[COPY3]] + ; CHECK-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = COPY [[COPY2]] ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_64_xexec = COPY [[S_MOV_B64_term]] - ; CHECK-NEXT: $exec = S_OR_B64_term $exec, killed [[COPY4]], implicit-def $scc - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[S_MOV_B64_term]] ; CHECK-NEXT: S_SLEEP 1 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY5]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc - ; CHECK-NEXT: [[S_XOR_B64_1:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_1]], [[COPY5]], implicit-def dead $scc - ; CHECK-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_1]] + ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_]], $exec, implicit-def $scc + ; CHECK-NEXT: [[S_XOR_B64_1:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_1]], $exec, implicit-def $scc + ; CHECK-NEXT: [[S_AND_B64_2:%[0-9]+]]:sreg_64 = S_AND_B64 [[S_AND_B64_1]], -1, implicit-def $scc + ; CHECK-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_1]], implicit $scc ; CHECK-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_1]], implicit $exec ; CHECK-NEXT: [[S_MOV_B64_term1:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_1]], implicit $exec - ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec - ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; CHECK-NEXT: S_BRANCH %bb.1 bb.0: liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31 @@ -261,7 +256,6 @@ body: | bb.2: %12:sreg_64_xexec = COPY %14 - SI_END_CF killed %12, implicit-def $exec, implicit-def dead $scc, implicit $exec S_SLEEP 1 %9:sreg_64_xexec = SI_IF %3, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec %14:sreg_64_xexec = S_MOV_B64_term %9, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/lower-i1-copies-clear-kills.mir b/llvm/test/CodeGen/AMDGPU/lower-i1-copies-clear-kills.mir index c5e2ba5d8c7cb..faea7bebdc8fc 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-i1-copies-clear-kills.mir +++ b/llvm/test/CodeGen/AMDGPU/lower-i1-copies-clear-kills.mir @@ -42,13 +42,13 @@ body: | ; CHECK-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[PHI1]], [[PHI2]], implicit $exec ; CHECK-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_B32_]], $exec_lo, implicit-def $scc + ; CHECK-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.5(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[PHI3:%[0-9]+]]:sreg_32 = PHI [[S_AND_B32_]], %bb.1, [[S_OR_B32_]], %bb.2 ; CHECK-NEXT: [[PHI4:%[0-9]+]]:vgpr_32 = PHI [[PHI2]], %bb.1, [[V_OR_B32_e64_]], %bb.2 - ; CHECK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[PHI3]] @@ -63,6 +63,7 @@ body: | ; CHECK-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32 = S_OR_B32 [[PHI1]], killed [[S_MOV_B32_5]], implicit-def dead $scc ; CHECK-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; CHECK-NEXT: [[S_XOR_B32_1:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, -1, implicit-def $scc + ; CHECK-NEXT: SI_WAVE_RECONVERGE [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: successors: %bb.6(0x04000000), %bb.1(0x7c000000) @@ -70,15 +71,12 @@ body: | ; CHECK-NEXT: [[PHI5:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_4]], %bb.3, [[S_XOR_B32_1]], %bb.4 ; CHECK-NEXT: [[PHI6:%[0-9]+]]:vgpr_32 = PHI [[COPY8]], %bb.3, [[PHI4]], %bb.4 ; CHECK-NEXT: [[PHI7:%[0-9]+]]:sreg_32 = PHI [[DEF]], %bb.3, [[S_OR_B32_1]], %bb.4 - ; CHECK-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[PHI5]] ; CHECK-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[COPY9]], [[PHI]], implicit-def dead $scc ; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.6 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.6: - ; CHECK-NEXT: [[PHI8:%[0-9]+]]:sreg_32 = PHI [[SI_IF_BREAK]], %bb.5 - ; CHECK-NEXT: SI_END_CF [[PHI8]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1(0x80000000) @@ -114,13 +112,13 @@ body: | %21:vgpr_32 = V_OR_B32_e64 %15, %17, implicit $exec %22:sreg_32 = S_MOV_B32 -1 %23:vreg_1 = COPY %22, implicit $exec + SI_WAVE_RECONVERGE %20, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.3: successors: %bb.4(0x40000000), %bb.5(0x40000000) %24:vgpr_32 = PHI %17, %bb.1, %21, %bb.2 %25:vreg_1 = PHI %7, %bb.1, %23, %bb.2 - SI_END_CF %20, implicit-def dead $exec, implicit-def dead $scc, implicit $exec %26:sreg_32 = S_MOV_B32 -1 %27:sreg_32 = IMPLICIT_DEF %28:sreg_32 = COPY %25 @@ -136,6 +134,7 @@ body: | %33:sreg_32 = S_OR_B32 %15, killed %32, implicit-def dead $scc %34:sreg_32 = S_MOV_B32 0 %35:vreg_1 = COPY %34, implicit $exec + SI_WAVE_RECONVERGE %31, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.5: successors: %bb.6(0x04000000), %bb.1(0x7c000000) @@ -143,15 +142,12 @@ body: | %18:vgpr_32 = PHI %29, %bb.3, %24, %bb.4 %16:sreg_32 = PHI %27, %bb.3, %33, %bb.4 %36:vreg_1 = PHI %30, %bb.3, %35, %bb.4 - SI_END_CF %31, implicit-def dead $exec, implicit-def dead $scc, implicit $exec %37:sreg_32 = COPY %36 %14:sreg_32 = SI_IF_BREAK %37, %13, implicit-def dead $scc SI_LOOP %14, %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.6 bb.6: - %38:sreg_32 = PHI %14, %bb.5 - SI_END_CF %38, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_ENDPGM 0 ... diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir index efa21052e3ae2..27a7c82ab0bde 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir @@ -38,13 +38,13 @@ body: | ; GFX9-NEXT: [[V_FMAC_F32_e64_3:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[V_FMAC_F32_e64_2]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX9-NEXT: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_]], [[V_FMAC_F32_e64_1]], implicit $mode, implicit $exec ; GFX9-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_2]], [[V_FMAC_F32_e64_3]], implicit $mode, implicit $exec + ; GFX9-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: bb.2: ; GFX9-NEXT: successors: %bb.3(0x80000000) ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_]], %bb.0, [[V_ADD_F32_e32_]], %bb.1 ; GFX9-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_1]], %bb.0, [[V_ADD_F32_e32_1]], %bb.1 - ; GFX9-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: bb.3: ; GFX9-NEXT: S_ENDPGM 0, implicit [[PHI]], implicit [[PHI1]] @@ -74,11 +74,11 @@ body: | bb.1: %20:vgpr_32 = V_ADD_F32_e32 %10, %11, implicit $mode, implicit $exec %21:vgpr_32 = V_ADD_F32_e32 %13, %14, implicit $mode, implicit $exec + SI_WAVE_RECONVERGE %19, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.2: %22:vgpr_32 = PHI %3, %bb.0, %20, %bb.1 %23:vgpr_32 = PHI %4, %bb.0, %21, %bb.1 - SI_END_CF %19, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.3: S_ENDPGM 0, implicit %22, implicit %23 @@ -120,13 +120,13 @@ body: | ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_]], [[V_FMAC_F32_e64_1]], implicit $mode, implicit $exec ; GFX9-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_2]], [[V_FMAC_F32_e64_3]], implicit $mode, implicit $exec + ; GFX9-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: bb.2: ; GFX9-NEXT: successors: %bb.3(0x80000000) ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_]], %bb.0, [[V_ADD_F32_e32_]], %bb.1 ; GFX9-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_1]], %bb.0, [[V_ADD_F32_e32_1]], %bb.1 - ; GFX9-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: bb.3: ; GFX9-NEXT: [[V_ADD_F32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_3]], [[V_FMAC_F32_e64_1]], implicit $mode, implicit $exec @@ -157,11 +157,11 @@ body: | bb.1: %20:vgpr_32 = V_ADD_F32_e32 %10, %11, implicit $mode, implicit $exec %21:vgpr_32 = V_ADD_F32_e32 %13, %14, implicit $mode, implicit $exec + SI_WAVE_RECONVERGE %19, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.2: %22:vgpr_32 = PHI %3, %bb.0, %20, %bb.1 %23:vgpr_32 = PHI %4, %bb.0, %21, %bb.1 - SI_END_CF %19, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.3: %24:vgpr_32 = V_ADD_F32_e32 %14, %11, implicit $mode, implicit $exec @@ -205,13 +205,13 @@ body: | ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_]], [[V_FMAC_F32_e64_1]], implicit $mode, implicit $exec ; GFX9-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_2]], [[V_FMAC_F32_e64_3]], implicit $mode, implicit $exec + ; GFX9-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: bb.2: ; GFX9-NEXT: successors: %bb.3(0x80000000) ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_]], %bb.0, [[V_ADD_F32_e32_]], %bb.1 ; GFX9-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_1]], %bb.0, [[V_ADD_F32_e32_1]], %bb.1 - ; GFX9-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: bb.3: ; GFX9-NEXT: S_ENDPGM 0, implicit [[PHI]], implicit [[PHI1]] @@ -242,11 +242,11 @@ body: | bb.1: %20:vgpr_32 = V_ADD_F32_e32 %10, %11, implicit $mode, implicit $exec %21:vgpr_32 = V_ADD_F32_e32 %13, %14, implicit $mode, implicit $exec + SI_WAVE_RECONVERGE %19, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.2: %22:vgpr_32 = PHI %3, %bb.0, %20, %bb.1 %23:vgpr_32 = PHI %4, %bb.0, %21, %bb.1 - SI_END_CF %19, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.3: S_ENDPGM 0, implicit %22, implicit %23 @@ -279,11 +279,12 @@ body: | ; GFX9-NEXT: bb.1: ; GFX9-NEXT: successors: %bb.2(0x80000000) ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: bb.2: ; GFX9-NEXT: successors: %bb.3(0x80000000) ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: S_NOP 0, implicit [[V_FMAC_F32_e64_]] - ; GFX9-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: bb.3: ; GFX9-NEXT: S_ENDPGM 0, implicit %6 @@ -306,10 +307,10 @@ body: | S_BRANCH %bb.1 bb.1: + SI_WAVE_RECONVERGE %5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.2: S_NOP 0, implicit %6 - SI_END_CF %5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.3: S_ENDPGM 0, implicit %9 @@ -348,6 +349,7 @@ body: | ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: S_NOP 0 + ; GFX9-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: bb.3: ; GFX9-NEXT: successors: %bb.4(0x40000000), %bb.6(0x40000000) @@ -356,7 +358,6 @@ body: | ; GFX9-NEXT: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec ; GFX9-NEXT: [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec ; GFX9-NEXT: S_NOP 0, implicit [[V_FMAC_F32_e64_]], implicit [[V_FMAC_F32_e64_1]] - ; GFX9-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX9-NEXT: S_CBRANCH_EXECZ %bb.6, implicit $exec ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: bb.4: @@ -412,13 +413,13 @@ body: | liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc S_NOP 0 + SI_WAVE_RECONVERGE %5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.3: successors: %bb.4(0x40000000), %bb.6(0x40000000) liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc S_NOP 0, implicit %6, implicit %7 - SI_END_CF %5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_CBRANCH_EXECZ %bb.6, implicit $exec bb.4: @@ -481,12 +482,12 @@ body: | ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: S_NOP 0 + ; GFX9-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: bb.3: ; GFX9-NEXT: successors: %bb.4(0x40000000), %bb.6(0x40000000) ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX9-NEXT: S_CBRANCH_EXECZ %bb.6, implicit $exec ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: bb.4: @@ -543,12 +544,12 @@ body: | liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc S_NOP 0 + SI_WAVE_RECONVERGE %5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.3: successors: %bb.4(0x40000000), %bb.6(0x40000000) liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc - SI_END_CF %5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_CBRANCH_EXECZ %bb.6, implicit $exec bb.4: @@ -620,13 +621,13 @@ body: | ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: S_NOP 0 + ; GFX9-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX9-NEXT: S_BRANCH %bb.4 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: bb.4: ; GFX9-NEXT: successors: %bb.5(0x40000000), %bb.7(0x40000000) ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX9-NEXT: S_CBRANCH_EXECZ %bb.7, implicit $exec ; GFX9-NEXT: S_BRANCH %bb.5 ; GFX9-NEXT: {{ $}} @@ -695,13 +696,13 @@ body: | liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc S_NOP 0 + SI_WAVE_RECONVERGE %5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.4 bb.4: successors: %bb.5(0x40000000), %bb.7(0x40000000) liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc - SI_END_CF %5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_CBRANCH_EXECZ %bb.7, implicit $exec S_BRANCH %bb.5 diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir index 04c80582f6f07..ea10d5b8ffb9d 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir @@ -40,7 +40,6 @@ body: | ; CHECK-NEXT: S_BRANCH %bb.4 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: SI_END_CF %9, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: S_ENDPGM 0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: @@ -53,6 +52,7 @@ body: | ; CHECK-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32 = S_ANDN2_B32 [[S_OR_B32_1]], $exec_lo, implicit-def $scc ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_CMP_NE_U32_e64_]], $exec_lo, implicit-def $scc ; CHECK-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_ANDN2_B32_1]], [[S_AND_B32_]], implicit-def $scc + ; CHECK-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: successors: %bb.6(0x04000000), %bb.2(0x7c000000) @@ -60,7 +60,6 @@ body: | ; CHECK-NEXT: [[PHI6:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_1]], %bb.2, [[S_OR_B32_2]], %bb.4 ; CHECK-NEXT: [[PHI7:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]], %bb.2, [[COPY4]], %bb.4 ; CHECK-NEXT: [[PHI8:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.2, [[V_ADD_U32_e64_]], %bb.4 - ; CHECK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[PHI6]], [[PHI4]], implicit-def dead $scc ; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.6 @@ -69,7 +68,6 @@ body: | ; CHECK-NEXT: successors: %bb.3(0x04000000), %bb.1(0x7c000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[PHI9:%[0-9]+]]:vgpr_32 = PHI [[PHI8]], %bb.5 - ; CHECK-NEXT: SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: [[SI_IF_BREAK1:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[PHI7]], [[PHI]], implicit-def dead $scc ; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK1]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.3 @@ -107,7 +105,6 @@ body: | S_BRANCH %bb.4 bb.3: - SI_END_CF %12, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_ENDPGM 0 bb.4: @@ -120,6 +117,7 @@ body: | %49:sreg_32 = S_ANDN2_B32 %45, $exec_lo, implicit-def $scc %50:sreg_32 = S_AND_B32 %30, $exec_lo, implicit-def $scc %46:sreg_32 = S_OR_B32 %49, %50, implicit-def $scc + SI_WAVE_RECONVERGE %4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.5: successors: %bb.6(0x04000000), %bb.2(0x7c000000) @@ -127,7 +125,6 @@ body: | %10:sreg_32 = PHI %45, %bb.2, %46, %bb.4 %8:sreg_32 = PHI %39, %bb.2, %40, %bb.4 %9:vgpr_32 = PHI %36, %bb.2, %6, %bb.4 - SI_END_CF %4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec %11:sreg_32 = SI_IF_BREAK %10, %2, implicit-def dead $scc %12:sreg_32 = SI_IF_BREAK %8, %0, implicit-def dead $scc SI_LOOP %11, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec @@ -137,7 +134,6 @@ body: | successors: %bb.3(0x04000000), %bb.1(0x7c000000) %13:vgpr_32 = PHI %9, %bb.5 - SI_END_CF %11, implicit-def dead $exec, implicit-def dead $scc, implicit $exec SI_LOOP %12, %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.3 ... diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll index b8e74bc7db09a..f0ddfda61ac5e 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll @@ -10,35 +10,39 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_and_b32_e32 v1, 1, v1 ; CHECK-NEXT: v_and_b32_e32 v3, 1, v3 -; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: s_mov_b32 s6, 0 ; CHECK-NEXT: v_cmp_eq_u32_e64 s4, 1, v1 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 -; CHECK-NEXT: s_xor_b32 s6, s4, -1 +; CHECK-NEXT: s_xor_b32 s5, s4, -1 ; CHECK-NEXT: s_inst_prefetch 0x1 ; CHECK-NEXT: s_branch .LBB0_3 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB0_1: ; %Flow ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; CHECK-NEXT: v_add_nc_u32_e32 v4, -4, v4 -; CHECK-NEXT: .LBB0_2: ; %Flow1 -; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 +; CHECK-NEXT: .LBB0_2: ; %for.end121 +; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: v_cmp_ne_u32_e64 s4, 0, v3 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; j lastloop entry ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_or_b32 s5, s4, s5 -; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; CHECK-NEXT: s_cbranch_execz .LBB0_8 +; CHECK-NEXT: s_or_b32 s6, s4, s6 +; CHECK-NEXT: s_andn2_b32 s4, exec_lo, s6 +; CHECK-NEXT: s_and_b32 s7, s4, -1 +; CHECK-NEXT: s_cselect_b32 exec_lo, s4, s6 +; CHECK-NEXT: s_cbranch_scc0 .LBB0_8 ; CHECK-NEXT: .LBB0_3: ; %for.body33 ; CHECK-NEXT: ; =>This Loop Header: Depth=1 ; CHECK-NEXT: ; Child Loop BB0_6 Depth 2 ; CHECK-NEXT: v_mov_b32_e32 v4, 0 ; CHECK-NEXT: v_mov_b32_e32 v3, 0 -; CHECK-NEXT: s_and_saveexec_b32 s7, s6 -; CHECK-NEXT: s_cbranch_execz .LBB0_2 +; CHECK-NEXT: s_and_b32 s4, s5, exec_lo +; CHECK-NEXT: s_mov_b32 s7, exec_lo +; CHECK-NEXT: s_and_b32 s8, s4, -1 +; CHECK-NEXT: s_cmov_b32 exec_lo, s4 +; CHECK-NEXT: s_cbranch_scc0 .LBB0_2 ; CHECK-NEXT: ; %bb.4: ; %for.body51.preheader ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: s_mov_b32 s8, 0 @@ -47,7 +51,6 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB0_5: ; %if.end118 ; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_add_i32 s9, s9, 4 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; backedge @@ -55,24 +58,29 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49 ; CHECK-NEXT: v_add_nc_u32_e32 v4, s9, v2 ; CHECK-NEXT: v_cmp_ge_u32_e64 s4, v4, v0 ; CHECK-NEXT: s_or_b32 s8, s4, s8 -; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 -; CHECK-NEXT: s_cbranch_execz .LBB0_1 +; CHECK-NEXT: s_andn2_b32 s4, exec_lo, s8 +; CHECK-NEXT: s_and_b32 s10, s4, -1 +; CHECK-NEXT: s_cselect_b32 exec_lo, s4, s8 +; CHECK-NEXT: s_cbranch_scc0 .LBB0_1 ; CHECK-NEXT: .LBB0_6: ; %for.body51 ; CHECK-NEXT: ; Parent Loop BB0_3 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v3, 1 -; CHECK-NEXT: s_and_saveexec_b32 s4, vcc_lo -; CHECK-NEXT: s_cbranch_execz .LBB0_5 +; CHECK-NEXT: s_and_b32 s10, vcc_lo, exec_lo +; CHECK-NEXT: s_mov_b32 s4, exec_lo +; CHECK-NEXT: s_and_b32 s11, s10, -1 +; CHECK-NEXT: s_cmov_b32 exec_lo, s10 +; CHECK-NEXT: s_cbranch_scc0 .LBB0_5 ; CHECK-NEXT: ; %bb.7: ; %if.then112 ; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2 ; CHECK-NEXT: s_add_i32 s10, s9, 4 ; CHECK-NEXT: v_mov_b32_e32 v3, 0 ; CHECK-NEXT: v_mov_b32_e32 v4, s10 ; CHECK-NEXT: ds_write_b32 v1, v4 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_branch .LBB0_5 ; CHECK-NEXT: .LBB0_8: ; %for.body159.preheader ; CHECK-NEXT: s_inst_prefetch 0x2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; CHECK-NEXT: s_mov_b32 vcc_lo, exec_lo ; CHECK-NEXT: .LBB0_9: ; %for.body159 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.mir index 037a285794120..bcfd13b698082 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.mir +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.mir @@ -50,7 +50,6 @@ body: | ; CHECK-NEXT: successors: %bb.2(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.4 - ; CHECK-NEXT: SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]] ; CHECK-NEXT: S_BRANCH %bb.2 @@ -64,7 +63,7 @@ body: | ; CHECK-NEXT: bb.7: ; CHECK-NEXT: successors: %bb.7(0x40000000), %bb.8(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.7, implicit undef $vcc + ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.7, implicit undef $vcc_lo ; CHECK-NEXT: S_BRANCH %bb.8 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.8: @@ -96,7 +95,6 @@ body: | bb.5: %7:vgpr_32 = PHI %0, %bb.4 - SI_END_CF %6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec INLINEASM &"", 1, implicit %5 S_BRANCH %bb.2 @@ -161,7 +159,6 @@ body: | ; CHECK-NEXT: successors: %bb.2(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.4 - ; CHECK-NEXT: SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY]], [[COPY1]], implicit-def dead $scc ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[S_ADD_I32_]] ; CHECK-NEXT: S_BRANCH %bb.2 @@ -175,7 +172,7 @@ body: | ; CHECK-NEXT: bb.7: ; CHECK-NEXT: successors: %bb.7(0x40000000), %bb.8(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.7, implicit undef $vcc + ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.7, implicit undef $vcc_lo ; CHECK-NEXT: S_BRANCH %bb.8 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.8: @@ -207,7 +204,6 @@ body: | bb.5: %7:vgpr_32 = PHI %0, %bb.4 - SI_END_CF %6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec INLINEASM &"", 1, implicit %5 S_BRANCH %bb.2 diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll index 6672568b98a20..a91c1d5158914 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll @@ -112,8 +112,10 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] ; CHECK-NEXT: v_mov_b32_e32 v42, v0 ; CHECK-NEXT: s_mov_b32 s42, exec_lo -; CHECK-NEXT: v_cmpx_ne_u32_e32 0, v42 -; CHECK-NEXT: s_cbranch_execz .LBB0_25 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v42 +; CHECK-NEXT: s_and_b32 s4, vcc_lo, -1 +; CHECK-NEXT: s_cmov_b32 exec_lo, vcc_lo +; CHECK-NEXT: s_cbranch_scc0 .LBB0_26 ; CHECK-NEXT: ; %bb.1: ; %.preheader5 ; CHECK-NEXT: v_mul_lo_u32 v0, v41, 14 ; CHECK-NEXT: s_mov_b32 s4, 0 @@ -125,62 +127,88 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v42 ; CHECK-NEXT: ds_write_b8 v1, v45 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 -; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; CHECK-NEXT: s_cbranch_execnz .LBB0_2 +; CHECK-NEXT: s_andn2_b32 s6, exec_lo, s4 +; CHECK-NEXT: s_and_b32 s7, s6, -1 +; CHECK-NEXT: s_cselect_b32 exec_lo, s6, s4 +; CHECK-NEXT: s_cbranch_scc1 .LBB0_2 ; CHECK-NEXT: ; %bb.3: -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: v_add_nc_u32_e32 v45, -1, v42 -; CHECK-NEXT: s_mov_b32 s43, 0 +; CHECK-NEXT: s_mov_b32 s43, exec_lo +; CHECK-NEXT: s_mov_b32 s48, 0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v45 -; CHECK-NEXT: s_and_b32 exec_lo, exec_lo, vcc_lo -; CHECK-NEXT: s_cbranch_execz .LBB0_25 +; CHECK-NEXT: s_and_b32 s4, vcc_lo, -1 +; CHECK-NEXT: s_cmov_b32 exec_lo, vcc_lo +; CHECK-NEXT: s_cbranch_scc0 .LBB0_25 ; CHECK-NEXT: ; %bb.4: ; CHECK-NEXT: v_lshlrev_b32_e32 v43, 10, v43 ; CHECK-NEXT: v_add_nc_u32_e32 v46, 0x3c05, v0 ; CHECK-NEXT: v_mov_b32_e32 v47, 0 -; CHECK-NEXT: s_mov_b32 s49, 0 -; CHECK-NEXT: .LBB0_5: ; =>This Loop Header: Depth=1 -; CHECK-NEXT: ; Child Loop BB0_8 Depth 2 -; CHECK-NEXT: ; Child Loop BB0_20 Depth 2 -; CHECK-NEXT: v_add_nc_u32_e32 v0, s49, v44 -; CHECK-NEXT: s_lshl_b32 s4, s49, 5 -; CHECK-NEXT: s_add_i32 s48, s49, 1 -; CHECK-NEXT: s_add_i32 s5, s49, 5 -; CHECK-NEXT: v_or3_b32 v57, s4, v43, s48 +; CHECK-NEXT: s_mov_b32 s52, 0 +; CHECK-NEXT: s_branch .LBB0_7 +; CHECK-NEXT: .LBB0_5: ; %Flow43 +; CHECK-NEXT: ; in Loop: Header=BB0_7 Depth=1 +; CHECK-NEXT: s_inst_prefetch 0x2 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s52 +; CHECK-NEXT: .LBB0_6: ; %Flow44 +; CHECK-NEXT: ; in Loop: Header=BB0_7 Depth=1 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s49, v45 +; CHECK-NEXT: v_cmp_lt_u32_e64 s4, 59, v47 +; CHECK-NEXT: v_add_nc_u32_e32 v46, 1, v46 +; CHECK-NEXT: s_mov_b32 s52, s49 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 +; CHECK-NEXT: s_and_b32 s4, exec_lo, s4 +; CHECK-NEXT: s_or_b32 s48, s4, s48 +; CHECK-NEXT: s_andn2_b32 s4, exec_lo, s48 +; CHECK-NEXT: s_and_b32 s5, s4, -1 +; CHECK-NEXT: s_cselect_b32 exec_lo, s4, s48 +; CHECK-NEXT: s_cbranch_scc0 .LBB0_24 +; CHECK-NEXT: .LBB0_7: ; =>This Loop Header: Depth=1 +; CHECK-NEXT: ; Child Loop BB0_10 Depth 2 +; CHECK-NEXT: ; Child Loop BB0_22 Depth 2 +; CHECK-NEXT: v_add_nc_u32_e32 v0, s52, v44 +; CHECK-NEXT: s_add_i32 s5, s52, 5 +; CHECK-NEXT: s_lshl_b32 s4, s52, 5 +; CHECK-NEXT: s_add_i32 s49, s52, 1 +; CHECK-NEXT: v_cmp_lt_u32_e32 vcc_lo, s5, v42 ; CHECK-NEXT: ds_read_u8 v0, v0 -; CHECK-NEXT: v_mov_b32_e32 v58, s48 -; CHECK-NEXT: s_mov_b32 s52, exec_lo +; CHECK-NEXT: v_or3_b32 v57, s4, v43, s49 +; CHECK-NEXT: v_mov_b32_e32 v58, s49 +; CHECK-NEXT: s_mov_b32 s53, exec_lo +; CHECK-NEXT: s_and_b32 s4, vcc_lo, -1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_and_b32_e32 v56, 0xff, v0 -; CHECK-NEXT: v_cmpx_lt_u32_e64 s5, v42 -; CHECK-NEXT: s_cbranch_execz .LBB0_17 -; CHECK-NEXT: ; %bb.6: ; %.preheader2 -; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: s_mov_b32 s53, 0 +; CHECK-NEXT: s_cmov_b32 exec_lo, vcc_lo +; CHECK-NEXT: s_cbranch_scc0 .LBB0_19 +; CHECK-NEXT: ; %bb.8: ; %.preheader2 +; CHECK-NEXT: ; in Loop: Header=BB0_7 Depth=1 ; CHECK-NEXT: s_mov_b32 s54, 0 -; CHECK-NEXT: s_branch .LBB0_8 -; CHECK-NEXT: .LBB0_7: ; in Loop: Header=BB0_8 Depth=2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55 -; CHECK-NEXT: s_add_i32 s54, s54, 4 -; CHECK-NEXT: s_add_i32 s4, s49, s54 -; CHECK-NEXT: v_add_nc_u32_e32 v0, s54, v57 +; CHECK-NEXT: s_mov_b32 s55, 0 +; CHECK-NEXT: s_branch .LBB0_10 +; CHECK-NEXT: .LBB0_9: ; in Loop: Header=BB0_10 Depth=2 +; CHECK-NEXT: s_add_i32 s55, s55, 4 +; CHECK-NEXT: s_add_i32 s4, s52, s55 +; CHECK-NEXT: v_add_nc_u32_e32 v0, s55, v57 ; CHECK-NEXT: s_add_i32 s5, s4, 5 ; CHECK-NEXT: s_add_i32 s4, s4, 1 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s5, v42 ; CHECK-NEXT: v_mov_b32_e32 v58, s4 -; CHECK-NEXT: s_or_b32 s53, vcc_lo, s53 -; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s53 -; CHECK-NEXT: s_cbranch_execz .LBB0_16 -; CHECK-NEXT: .LBB0_8: ; Parent Loop BB0_5 Depth=1 +; CHECK-NEXT: s_or_b32 s54, vcc_lo, s54 +; CHECK-NEXT: s_andn2_b32 s4, exec_lo, s54 +; CHECK-NEXT: s_and_b32 s5, s4, -1 +; CHECK-NEXT: s_cselect_b32 exec_lo, s4, s54 +; CHECK-NEXT: s_cbranch_scc0 .LBB0_18 +; CHECK-NEXT: .LBB0_10: ; Parent Loop BB0_7 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 -; CHECK-NEXT: v_add_nc_u32_e32 v59, s54, v46 -; CHECK-NEXT: v_add_nc_u32_e32 v58, s54, v57 -; CHECK-NEXT: s_mov_b32 s55, exec_lo +; CHECK-NEXT: v_add_nc_u32_e32 v59, s55, v46 +; CHECK-NEXT: v_add_nc_u32_e32 v58, s55, v57 +; CHECK-NEXT: s_mov_b32 s56, exec_lo ; CHECK-NEXT: ds_read_u8 v0, v59 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_cmpx_eq_u16_e64 v56, v0 -; CHECK-NEXT: s_cbranch_execz .LBB0_10 -; CHECK-NEXT: ; %bb.9: ; in Loop: Header=BB0_8 Depth=2 +; CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, v56, v0 +; CHECK-NEXT: s_and_b32 s4, vcc_lo, -1 +; CHECK-NEXT: s_cmov_b32 exec_lo, vcc_lo +; CHECK-NEXT: s_cbranch_scc0 .LBB0_12 +; CHECK-NEXT: ; %bb.11: ; in Loop: Header=BB0_10 Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00 ; CHECK-NEXT: s_add_u32 s8, s34, 40 @@ -197,14 +225,16 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CHECK-NEXT: ds_write_b32 v0, v58 -; CHECK-NEXT: .LBB0_10: ; in Loop: Header=BB0_8 Depth=2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s56 +; CHECK-NEXT: .LBB0_12: ; in Loop: Header=BB0_10 Depth=2 ; CHECK-NEXT: ds_read_u8 v0, v59 offset:1 -; CHECK-NEXT: s_mov_b32 s55, exec_lo +; CHECK-NEXT: s_mov_b32 s56, exec_lo ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_cmpx_eq_u16_e64 v56, v0 -; CHECK-NEXT: s_cbranch_execz .LBB0_12 -; CHECK-NEXT: ; %bb.11: ; in Loop: Header=BB0_8 Depth=2 +; CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, v56, v0 +; CHECK-NEXT: s_and_b32 s4, vcc_lo, -1 +; CHECK-NEXT: s_cmov_b32 exec_lo, vcc_lo +; CHECK-NEXT: s_cbranch_scc0 .LBB0_14 +; CHECK-NEXT: ; %bb.13: ; in Loop: Header=BB0_10 Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00 ; CHECK-NEXT: s_add_u32 s8, s34, 40 @@ -222,14 +252,16 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CHECK-NEXT: ds_write_b32 v0, v60 -; CHECK-NEXT: .LBB0_12: ; in Loop: Header=BB0_8 Depth=2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s56 +; CHECK-NEXT: .LBB0_14: ; in Loop: Header=BB0_10 Depth=2 ; CHECK-NEXT: ds_read_u8 v0, v59 offset:2 -; CHECK-NEXT: s_mov_b32 s55, exec_lo +; CHECK-NEXT: s_mov_b32 s56, exec_lo ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_cmpx_eq_u16_e64 v56, v0 -; CHECK-NEXT: s_cbranch_execz .LBB0_14 -; CHECK-NEXT: ; %bb.13: ; in Loop: Header=BB0_8 Depth=2 +; CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, v56, v0 +; CHECK-NEXT: s_and_b32 s4, vcc_lo, -1 +; CHECK-NEXT: s_cmov_b32 exec_lo, vcc_lo +; CHECK-NEXT: s_cbranch_scc0 .LBB0_16 +; CHECK-NEXT: ; %bb.15: ; in Loop: Header=BB0_10 Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00 ; CHECK-NEXT: s_add_u32 s8, s34, 40 @@ -247,14 +279,16 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CHECK-NEXT: ds_write_b32 v0, v60 -; CHECK-NEXT: .LBB0_14: ; in Loop: Header=BB0_8 Depth=2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s56 +; CHECK-NEXT: .LBB0_16: ; in Loop: Header=BB0_10 Depth=2 ; CHECK-NEXT: ds_read_u8 v0, v59 offset:3 -; CHECK-NEXT: s_mov_b32 s55, exec_lo +; CHECK-NEXT: s_mov_b32 s56, exec_lo ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_cmpx_eq_u16_e64 v56, v0 -; CHECK-NEXT: s_cbranch_execz .LBB0_7 -; CHECK-NEXT: ; %bb.15: ; in Loop: Header=BB0_8 Depth=2 +; CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, v56, v0 +; CHECK-NEXT: s_and_b32 s4, vcc_lo, -1 +; CHECK-NEXT: s_cmov_b32 exec_lo, vcc_lo +; CHECK-NEXT: s_cbranch_scc0 .LBB0_9 +; CHECK-NEXT: ; %bb.17: ; in Loop: Header=BB0_10 Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00 ; CHECK-NEXT: s_add_u32 s8, s34, 40 @@ -272,40 +306,44 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CHECK-NEXT: ds_write_b32 v0, v58 -; CHECK-NEXT: s_branch .LBB0_7 -; CHECK-NEXT: .LBB0_16: ; %Flow45 -; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s53 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s56 +; CHECK-NEXT: s_branch .LBB0_9 +; CHECK-NEXT: .LBB0_18: ; %Flow45 +; CHECK-NEXT: ; in Loop: Header=BB0_7 Depth=1 ; CHECK-NEXT: v_mov_b32_e32 v57, v0 -; CHECK-NEXT: .LBB0_17: ; %Flow46 -; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s52 -; CHECK-NEXT: s_mov_b32 s49, exec_lo -; CHECK-NEXT: v_cmpx_lt_u32_e64 v58, v42 -; CHECK-NEXT: s_cbranch_execz .LBB0_23 -; CHECK-NEXT: ; %bb.18: ; %.preheader -; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: s_mov_b32 s52, 0 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s53 +; CHECK-NEXT: .LBB0_19: ; in Loop: Header=BB0_7 Depth=1 +; CHECK-NEXT: v_cmp_lt_u32_e32 vcc_lo, v58, v42 +; CHECK-NEXT: s_xor_b32 s52, vcc_lo, exec_lo +; CHECK-NEXT: s_and_b32 s4, vcc_lo, -1 +; CHECK-NEXT: s_cmov_b32 exec_lo, vcc_lo +; CHECK-NEXT: s_cbranch_scc0 .LBB0_6 +; CHECK-NEXT: ; %bb.20: ; %.preheader +; CHECK-NEXT: ; in Loop: Header=BB0_7 Depth=1 +; CHECK-NEXT: s_mov_b32 s53, 0 ; CHECK-NEXT: s_inst_prefetch 0x1 -; CHECK-NEXT: s_branch .LBB0_20 +; CHECK-NEXT: s_branch .LBB0_22 ; CHECK-NEXT: .p2align 6 -; CHECK-NEXT: .LBB0_19: ; in Loop: Header=BB0_20 Depth=2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s53 +; CHECK-NEXT: .LBB0_21: ; in Loop: Header=BB0_22 Depth=2 ; CHECK-NEXT: v_add_nc_u32_e32 v58, 1, v58 ; CHECK-NEXT: v_add_nc_u32_e32 v57, 1, v57 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v58, v42 -; CHECK-NEXT: s_or_b32 s52, vcc_lo, s52 -; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s52 -; CHECK-NEXT: s_cbranch_execz .LBB0_22 -; CHECK-NEXT: .LBB0_20: ; Parent Loop BB0_5 Depth=1 +; CHECK-NEXT: s_or_b32 s53, vcc_lo, s53 +; CHECK-NEXT: s_andn2_b32 s4, exec_lo, s53 +; CHECK-NEXT: s_and_b32 s5, s4, -1 +; CHECK-NEXT: s_cselect_b32 exec_lo, s4, s53 +; CHECK-NEXT: s_cbranch_scc0 .LBB0_5 +; CHECK-NEXT: .LBB0_22: ; Parent Loop BB0_7 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v44, v58 -; CHECK-NEXT: s_mov_b32 s53, exec_lo +; CHECK-NEXT: s_mov_b32 s54, exec_lo ; CHECK-NEXT: ds_read_u8 v0, v0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_cmpx_eq_u16_e64 v56, v0 -; CHECK-NEXT: s_cbranch_execz .LBB0_19 -; CHECK-NEXT: ; %bb.21: ; in Loop: Header=BB0_20 Depth=2 +; CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, v56, v0 +; CHECK-NEXT: s_and_b32 s4, vcc_lo, -1 +; CHECK-NEXT: s_cmov_b32 exec_lo, vcc_lo +; CHECK-NEXT: s_cbranch_scc0 .LBB0_21 +; CHECK-NEXT: ; %bb.23: ; in Loop: Header=BB0_22 Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00 ; CHECK-NEXT: s_add_u32 s8, s34, 40 @@ -322,26 +360,13 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CHECK-NEXT: ds_write_b32 v0, v57 -; CHECK-NEXT: s_branch .LBB0_19 -; CHECK-NEXT: .LBB0_22: ; %Flow43 -; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: s_inst_prefetch 0x2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s52 -; CHECK-NEXT: .LBB0_23: ; %Flow44 -; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s49 -; CHECK-NEXT: ; %bb.24: ; in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s48, v45 -; CHECK-NEXT: v_cmp_lt_u32_e64 s4, 59, v47 -; CHECK-NEXT: v_add_nc_u32_e32 v46, 1, v46 -; CHECK-NEXT: s_mov_b32 s49, s48 -; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 -; CHECK-NEXT: s_and_b32 s4, exec_lo, s4 -; CHECK-NEXT: s_or_b32 s43, s4, s43 -; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s43 -; CHECK-NEXT: s_cbranch_execnz .LBB0_5 -; CHECK-NEXT: .LBB0_25: ; %Flow51 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s54 +; CHECK-NEXT: s_branch .LBB0_21 +; CHECK-NEXT: .LBB0_24: ; %Flow47 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s43 +; CHECK-NEXT: .LBB0_25: ; %Flow49 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s42 +; CHECK-NEXT: .LBB0_26: ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: s_add_u32 s8, s34, 40 @@ -356,16 +381,19 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_addc_u32 s7, s7, _Z7barrierj@rel32@hi+12 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_mov_b32 s4, exec_lo ; CHECK-NEXT: ds_read_b32 v47, v0 offset:15360 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_cmpx_gt_u32_e64 v47, v41 -; CHECK-NEXT: s_cbranch_execz .LBB0_33 -; CHECK-NEXT: ; %bb.26: +; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, v47, v41 +; CHECK-NEXT: s_and_b32 s4, vcc_lo, -1 +; CHECK-NEXT: s_cmov_b32 exec_lo, vcc_lo +; CHECK-NEXT: s_cbranch_scc0 .LBB0_35 +; CHECK-NEXT: ; %bb.27: ; CHECK-NEXT: s_mov_b32 s42, 0 -; CHECK-NEXT: s_branch .LBB0_28 -; CHECK-NEXT: .LBB0_27: ; in Loop: Header=BB0_28 Depth=1 +; CHECK-NEXT: s_branch .LBB0_30 +; CHECK-NEXT: .LBB0_28: ; %Flow40 +; CHECK-NEXT: ; in Loop: Header=BB0_30 Depth=1 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s43 +; CHECK-NEXT: .LBB0_29: ; in Loop: Header=BB0_30 Depth=1 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_add_u32 s8, s34, 40 @@ -382,9 +410,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_add_co_u32 v41, vcc_lo, v0, v41 ; CHECK-NEXT: v_cmp_le_u32_e32 vcc_lo, v47, v41 ; CHECK-NEXT: s_or_b32 s42, vcc_lo, s42 -; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s42 -; CHECK-NEXT: s_cbranch_execz .LBB0_33 -; CHECK-NEXT: .LBB0_28: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: s_andn2_b32 s4, exec_lo, s42 +; CHECK-NEXT: s_and_b32 s5, s4, -1 +; CHECK-NEXT: s_cselect_b32 exec_lo, s4, s42 +; CHECK-NEXT: s_cbranch_scc0 .LBB0_35 +; CHECK-NEXT: .LBB0_30: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v41 ; CHECK-NEXT: s_mov_b32 s43, exec_lo ; CHECK-NEXT: ds_read_b32 v0, v0 @@ -411,9 +441,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_xor_b32_e32 v56, v10, v6 ; CHECK-NEXT: v_or_b32_e32 v5, v46, v57 ; CHECK-NEXT: v_or_b32_e32 v4, v45, v56 -; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[4:5] -; CHECK-NEXT: s_cbranch_execz .LBB0_27 -; CHECK-NEXT: ; %bb.29: ; in Loop: Header=BB0_28 Depth=1 +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; CHECK-NEXT: s_and_b32 s4, vcc_lo, -1 +; CHECK-NEXT: s_cmov_b32 exec_lo, vcc_lo +; CHECK-NEXT: s_cbranch_scc0 .LBB0_29 +; CHECK-NEXT: ; %bb.31: ; in Loop: Header=BB0_30 Depth=1 ; CHECK-NEXT: s_clause 0x1 ; CHECK-NEXT: global_load_dwordx2 v[58:59], v[2:3], off offset:24 ; CHECK-NEXT: global_load_dwordx2 v[60:61], v[0:1], off offset:24 @@ -449,11 +481,12 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_mov_b32_e32 v1, v43 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] ; CHECK-NEXT: v_bfe_u32 v0, v0, v74, 4 -; CHECK-NEXT: s_mov_b32 s4, exec_lo -; CHECK-NEXT: v_cmpx_gt_u32_e32 12, v0 -; CHECK-NEXT: s_xor_b32 s4, exec_lo, s4 -; CHECK-NEXT: s_cbranch_execz .LBB0_31 -; CHECK-NEXT: ; %bb.30: ; in Loop: Header=BB0_28 Depth=1 +; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0 +; CHECK-NEXT: s_xor_b32 s4, vcc_lo, exec_lo +; CHECK-NEXT: s_and_b32 s5, vcc_lo, -1 +; CHECK-NEXT: s_cmov_b32 exec_lo, vcc_lo +; CHECK-NEXT: s_cbranch_scc0 .LBB0_33 +; CHECK-NEXT: ; %bb.32: ; in Loop: Header=BB0_30 Depth=1 ; CHECK-NEXT: v_xor_b32_e32 v4, v60, v58 ; CHECK-NEXT: v_lshrrev_b64 v[2:3], 16, v[56:57] ; CHECK-NEXT: v_mad_u64_u32 v[6:7], null, 0x180, v73, s[46:47] @@ -476,11 +509,14 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: global_store_dword v[6:7], v8, off offset:4 ; CHECK-NEXT: global_store_dwordx4 v[6:7], v[0:3], off offset:8 ; CHECK-NEXT: global_store_dwordx2 v[6:7], v[4:5], off offset:24 -; CHECK-NEXT: .LBB0_31: ; %Flow -; CHECK-NEXT: ; in Loop: Header=BB0_28 Depth=1 -; CHECK-NEXT: s_andn2_saveexec_b32 s4, s4 -; CHECK-NEXT: s_cbranch_execz .LBB0_27 -; CHECK-NEXT: ; %bb.32: ; in Loop: Header=BB0_28 Depth=1 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: .LBB0_33: ; %Flow +; CHECK-NEXT: ; in Loop: Header=BB0_30 Depth=1 +; CHECK-NEXT: s_xor_b32 s48, s4, exec_lo +; CHECK-NEXT: s_and_b32 s5, s4, -1 +; CHECK-NEXT: s_cmov_b32 exec_lo, s4 +; CHECK-NEXT: s_cbranch_scc0 .LBB0_28 +; CHECK-NEXT: ; %bb.34: ; in Loop: Header=BB0_30 Depth=1 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, v42 ; CHECK-NEXT: v_mov_b32_e32 v1, v43 @@ -496,8 +532,9 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_add_u32 s6, s6, _Z10atomic_subPU3AS1Vjj@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s7, s7, _Z10atomic_subPU3AS1Vjj@rel32@hi+12 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] -; CHECK-NEXT: s_branch .LBB0_27 -; CHECK-NEXT: .LBB0_33: +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s48 +; CHECK-NEXT: s_branch .LBB0_28 +; CHECK-NEXT: .LBB0_35: ; CHECK-NEXT: s_endpgm %6 = tail call i64 @_Z13get_global_idj(i32 noundef 0) #4 %7 = trunc i64 %6 to i32 @@ -852,27 +889,46 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: ds_write_b8 v46, v43 offset:15364 ; CHECK-NEXT: v_add_nc_u32_e32 v45, -1, v41 -; CHECK-NEXT: .LBB1_1: ; %.37 +; CHECK-NEXT: s_branch .LBB1_3 +; CHECK-NEXT: .LBB1_1: ; %Flow +; CHECK-NEXT: ; in Loop: Header=BB1_3 Depth=1 +; CHECK-NEXT: s_inst_prefetch 0x2 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s44 +; CHECK-NEXT: .LBB1_2: ; %.32 +; CHECK-NEXT: ; in Loop: Header=BB1_3 Depth=1 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s43, v45 +; CHECK-NEXT: v_cmp_lt_u32_e64 s4, 59, v43 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 +; CHECK-NEXT: s_and_b32 s4, exec_lo, s4 +; CHECK-NEXT: s_or_b32 s42, s4, s42 +; CHECK-NEXT: s_mov_b32 s4, s43 +; CHECK-NEXT: s_andn2_b32 s5, exec_lo, s42 +; CHECK-NEXT: s_and_b32 s6, s5, -1 +; CHECK-NEXT: s_cselect_b32 exec_lo, s5, s42 +; CHECK-NEXT: s_cbranch_scc0 .LBB1_12 +; CHECK-NEXT: .LBB1_3: ; %.37 ; CHECK-NEXT: ; =>This Loop Header: Depth=1 -; CHECK-NEXT: ; Child Loop BB1_3 Depth 2 -; CHECK-NEXT: ; Child Loop BB1_8 Depth 2 +; CHECK-NEXT: ; Child Loop BB1_5 Depth 2 +; CHECK-NEXT: ; Child Loop BB1_10 Depth 2 ; CHECK-NEXT: v_add_nc_u32_e32 v0, s4, v44 -; CHECK-NEXT: s_lshl_b32 s5, s4, 5 +; CHECK-NEXT: s_add_i32 s7, s4, 5 +; CHECK-NEXT: s_lshl_b32 s6, s4, 5 ; CHECK-NEXT: s_add_i32 s43, s4, 1 -; CHECK-NEXT: s_add_i32 s6, s4, 5 -; CHECK-NEXT: v_or3_b32 v47, s5, v42, s43 +; CHECK-NEXT: v_cmp_lt_u32_e32 vcc_lo, s7, v41 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: ds_read_u8 v46, v0 +; CHECK-NEXT: v_or3_b32 v47, s6, v42, s43 ; CHECK-NEXT: v_mov_b32_e32 v56, s43 ; CHECK-NEXT: s_mov_b32 s5, exec_lo -; CHECK-NEXT: v_cmpx_lt_u32_e64 s6, v41 -; CHECK-NEXT: s_cbranch_execz .LBB1_5 -; CHECK-NEXT: ; %bb.2: ; %.53.preheader -; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; CHECK-NEXT: s_and_b32 s6, vcc_lo, -1 +; CHECK-NEXT: s_cmov_b32 exec_lo, vcc_lo +; CHECK-NEXT: s_cbranch_scc0 .LBB1_7 +; CHECK-NEXT: ; %bb.4: ; %.53.preheader +; CHECK-NEXT: ; in Loop: Header=BB1_3 Depth=1 ; CHECK-NEXT: s_mov_b32 s6, 0 ; CHECK-NEXT: s_mov_b32 s7, 0 -; CHECK-NEXT: .LBB1_3: ; %.53 -; CHECK-NEXT: ; Parent Loop BB1_1 Depth=1 +; CHECK-NEXT: .LBB1_5: ; %.53 +; CHECK-NEXT: ; Parent Loop BB1_3 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 ; CHECK-NEXT: s_add_i32 s7, s7, 4 ; CHECK-NEXT: v_add_nc_u32_e32 v43, 1, v43 @@ -883,44 +939,50 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s9, v41 ; CHECK-NEXT: v_mov_b32_e32 v56, s8 ; CHECK-NEXT: s_or_b32 s6, vcc_lo, s6 -; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 -; CHECK-NEXT: s_cbranch_execnz .LBB1_3 -; CHECK-NEXT: ; %bb.4: ; %Flow3 -; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; CHECK-NEXT: s_andn2_b32 s8, exec_lo, s6 +; CHECK-NEXT: s_and_b32 s9, s8, -1 +; CHECK-NEXT: s_cselect_b32 exec_lo, s8, s6 +; CHECK-NEXT: s_cbranch_scc1 .LBB1_5 +; CHECK-NEXT: ; %bb.6: ; %Flow3 +; CHECK-NEXT: ; in Loop: Header=BB1_3 Depth=1 ; CHECK-NEXT: v_mov_b32_e32 v47, v0 -; CHECK-NEXT: .LBB1_5: ; %Flow4 -; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; CHECK-NEXT: s_mov_b32 s44, exec_lo -; CHECK-NEXT: v_cmpx_lt_u32_e64 v56, v41 -; CHECK-NEXT: s_cbranch_execz .LBB1_11 -; CHECK-NEXT: ; %bb.6: ; %.103.preheader -; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; CHECK-NEXT: .LBB1_7: ; %.48 +; CHECK-NEXT: ; in Loop: Header=BB1_3 Depth=1 +; CHECK-NEXT: v_cmp_lt_u32_e32 vcc_lo, v56, v41 +; CHECK-NEXT: s_xor_b32 s44, vcc_lo, exec_lo +; CHECK-NEXT: s_and_b32 s4, vcc_lo, -1 +; CHECK-NEXT: s_cmov_b32 exec_lo, vcc_lo +; CHECK-NEXT: s_cbranch_scc0 .LBB1_2 +; CHECK-NEXT: ; %bb.8: ; %.103.preheader +; CHECK-NEXT: ; in Loop: Header=BB1_3 Depth=1 ; CHECK-NEXT: s_mov_b32 s45, 0 ; CHECK-NEXT: s_inst_prefetch 0x1 -; CHECK-NEXT: s_branch .LBB1_8 +; CHECK-NEXT: s_branch .LBB1_10 ; CHECK-NEXT: .p2align 6 -; CHECK-NEXT: .LBB1_7: ; %.114 -; CHECK-NEXT: ; in Loop: Header=BB1_8 Depth=2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s46 +; CHECK-NEXT: .LBB1_9: ; %.114 +; CHECK-NEXT: ; in Loop: Header=BB1_10 Depth=2 ; CHECK-NEXT: v_add_nc_u32_e32 v56, 1, v56 ; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v56, v41 ; CHECK-NEXT: s_or_b32 s45, vcc_lo, s45 -; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s45 -; CHECK-NEXT: s_cbranch_execz .LBB1_10 -; CHECK-NEXT: .LBB1_8: ; %.103 -; CHECK-NEXT: ; Parent Loop BB1_1 Depth=1 +; CHECK-NEXT: s_andn2_b32 s4, exec_lo, s45 +; CHECK-NEXT: s_and_b32 s5, s4, -1 +; CHECK-NEXT: s_cselect_b32 exec_lo, s4, s45 +; CHECK-NEXT: s_cbranch_scc0 .LBB1_1 +; CHECK-NEXT: .LBB1_10: ; %.103 +; CHECK-NEXT: ; Parent Loop BB1_3 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v44, v56 +; CHECK-NEXT: s_mov_b32 s46, exec_lo ; CHECK-NEXT: ds_read_u8 v0, v0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v46, v0 src0_sel:BYTE_0 src1_sel:DWORD -; CHECK-NEXT: s_and_saveexec_b32 s46, s4 -; CHECK-NEXT: s_cbranch_execz .LBB1_7 -; CHECK-NEXT: ; %bb.9: ; %.110 -; CHECK-NEXT: ; in Loop: Header=BB1_8 Depth=2 +; CHECK-NEXT: s_and_b32 s5, s4, -1 +; CHECK-NEXT: s_cmov_b32 exec_lo, s4 +; CHECK-NEXT: s_cbranch_scc0 .LBB1_9 +; CHECK-NEXT: ; %bb.11: ; %.110 +; CHECK-NEXT: ; in Loop: Header=BB1_10 Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00 ; CHECK-NEXT: s_add_u32 s8, s36, 40 @@ -937,26 +999,9 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CHECK-NEXT: ds_write_b32 v0, v47 -; CHECK-NEXT: s_branch .LBB1_7 -; CHECK-NEXT: .LBB1_10: ; %Flow -; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; CHECK-NEXT: s_inst_prefetch 0x2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s45 -; CHECK-NEXT: .LBB1_11: ; %Flow2 -; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s44 -; CHECK-NEXT: ; %bb.12: ; %.32 -; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s43, v45 -; CHECK-NEXT: v_cmp_lt_u32_e64 s4, 59, v43 -; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 -; CHECK-NEXT: s_and_b32 s4, exec_lo, s4 -; CHECK-NEXT: s_or_b32 s42, s4, s42 -; CHECK-NEXT: s_mov_b32 s4, s43 -; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s42 -; CHECK-NEXT: s_cbranch_execnz .LBB1_1 -; CHECK-NEXT: ; %bb.13: ; %.119 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s42 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s46 +; CHECK-NEXT: s_branch .LBB1_9 +; CHECK-NEXT: .LBB1_12: ; %.119 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: s_add_u32 s8, s36, 40 diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.mir index 329f296712160..c4c3878a7e98b 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.mir +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.mir @@ -30,7 +30,6 @@ body: | ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: FLAT_STORE_DWORD [[COPY1]], [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) ; CHECK-NEXT: SI_RETURN bb.0: @@ -57,7 +56,6 @@ body: | S_BRANCH %bb.2 bb.2: - SI_END_CF %6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec FLAT_STORE_DWORD %3, %9, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) SI_RETURN ... @@ -93,7 +91,6 @@ body: | ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: FLAT_STORE_DWORD [[COPY1]], [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) ; CHECK-NEXT: SI_RETURN bb.0: @@ -122,7 +119,6 @@ body: | S_BRANCH %bb.2 bb.2: - SI_END_CF %6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec FLAT_STORE_DWORD %3, %11, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) SI_RETURN ... diff --git a/llvm/test/CodeGen/AMDGPU/madmk.ll b/llvm/test/CodeGen/AMDGPU/madmk.ll index d5ef866dc8a85..89e6ed16d7665 100644 --- a/llvm/test/CodeGen/AMDGPU/madmk.ll +++ b/llvm/test/CodeGen/AMDGPU/madmk.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; RUN: llc -mtriple=amdgcn -mattr=+mad-mac-f32-insts -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s ; XUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll index 1dd18b4228fe5..ed880fd428249 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll @@ -106,24 +106,31 @@ define void @issue63986(i64 %0, i64 %idxprom) { ; CHECK-NEXT: s_branch .LBB0_12 ; CHECK-NEXT: .LBB0_10: ; %Flow19 ; CHECK-NEXT: ; in Loop: Header=BB0_12 Depth=1 -; CHECK-NEXT: s_or_b64 exec, exec, s[10:11] ; CHECK-NEXT: s_mov_b64 s[8:9], 0 ; CHECK-NEXT: .LBB0_11: ; %Flow21 ; CHECK-NEXT: ; in Loop: Header=BB0_12 Depth=1 ; CHECK-NEXT: s_andn2_b64 vcc, exec, s[8:9] -; CHECK-NEXT: s_cbranch_vccz .LBB0_20 +; CHECK-NEXT: s_cbranch_vccz .LBB0_21 ; CHECK-NEXT: .LBB0_12: ; %while.cond ; CHECK-NEXT: ; =>This Loop Header: Depth=1 -; CHECK-NEXT: ; Child Loop BB0_14 Depth 2 -; CHECK-NEXT: ; Child Loop BB0_18 Depth 2 -; CHECK-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] -; CHECK-NEXT: s_cbranch_execz .LBB0_15 -; CHECK-NEXT: ; %bb.13: ; %loop-memcpy-expansion2.preheader +; CHECK-NEXT: ; Child Loop BB0_15 Depth 2 +; CHECK-NEXT: ; Child Loop BB0_19 Depth 2 +; CHECK-NEXT: s_and_b64 s[10:11], s[4:5], exec +; CHECK-NEXT: s_mov_b64 s[8:9], exec +; CHECK-NEXT: s_and_b64 s[12:13], s[10:11], -1 +; CHECK-NEXT: s_cmov_b64 exec, s[10:11] +; CHECK-NEXT: s_cbranch_scc1 .LBB0_14 +; CHECK-NEXT: ; %bb.13: ; %Flow20 +; CHECK-NEXT: ; in Loop: Header=BB0_12 Depth=1 +; CHECK-NEXT: s_mov_b64 s[8:9], -1 +; CHECK-NEXT: s_cbranch_execz .LBB0_11 +; CHECK-NEXT: s_branch .LBB0_17 +; CHECK-NEXT: .LBB0_14: ; %loop-memcpy-expansion2.preheader ; CHECK-NEXT: ; in Loop: Header=BB0_12 Depth=1 ; CHECK-NEXT: s_mov_b64 s[10:11], 0 ; CHECK-NEXT: s_mov_b64 s[12:13], 0 ; CHECK-NEXT: s_mov_b64 s[14:15], 0 -; CHECK-NEXT: .LBB0_14: ; %loop-memcpy-expansion2 +; CHECK-NEXT: .LBB0_15: ; %loop-memcpy-expansion2 ; CHECK-NEXT: ; Parent Loop BB0_12 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v10, s10 @@ -152,6 +159,8 @@ define void @issue63986(i64 %0, i64 %idxprom) { ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc, s[14:15], v[4:5] ; CHECK-NEXT: s_addc_u32 s11, s11, 0 ; CHECK-NEXT: s_or_b64 s[12:13], vcc, s[12:13] +; CHECK-NEXT: s_andn2_b64 s[16:17], exec, s[12:13] +; CHECK-NEXT: s_and_b64 s[18:19], s[16:17], -1 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: flat_store_byte v[10:11], v15 offset:3 ; CHECK-NEXT: flat_store_byte v[10:11], v16 offset:2 @@ -169,23 +178,25 @@ define void @issue63986(i64 %0, i64 %idxprom) { ; CHECK-NEXT: flat_store_byte v[10:11], v21 offset:14 ; CHECK-NEXT: flat_store_byte v[10:11], v20 offset:13 ; CHECK-NEXT: flat_store_byte v[10:11], v27 offset:12 -; CHECK-NEXT: s_andn2_b64 exec, exec, s[12:13] -; CHECK-NEXT: s_cbranch_execnz .LBB0_14 -; CHECK-NEXT: .LBB0_15: ; %Flow20 +; CHECK-NEXT: s_cselect_b64 exec, s[16:17], s[12:13] +; CHECK-NEXT: s_cbranch_scc1 .LBB0_15 +; CHECK-NEXT: ; %bb.16: ; %loop.exit.guard ; CHECK-NEXT: ; in Loop: Header=BB0_12 Depth=1 ; CHECK-NEXT: s_or_b64 exec, exec, s[8:9] ; CHECK-NEXT: s_mov_b64 s[8:9], -1 ; CHECK-NEXT: s_cbranch_execz .LBB0_11 -; CHECK-NEXT: ; %bb.16: ; %loop-memcpy-residual-header5 +; CHECK-NEXT: .LBB0_17: ; %loop-memcpy-residual-header5 ; CHECK-NEXT: ; in Loop: Header=BB0_12 Depth=1 -; CHECK-NEXT: s_and_saveexec_b64 s[8:9], s[6:7] -; CHECK-NEXT: s_xor_b64 s[10:11], exec, s[8:9] -; CHECK-NEXT: s_cbranch_execz .LBB0_10 -; CHECK-NEXT: ; %bb.17: ; %loop-memcpy-residual4.preheader +; CHECK-NEXT: s_and_b64 s[8:9], s[6:7], exec +; CHECK-NEXT: s_xor_b64 s[10:11], s[8:9], exec +; CHECK-NEXT: s_and_b64 s[12:13], s[8:9], -1 +; CHECK-NEXT: s_cmov_b64 exec, s[8:9] +; CHECK-NEXT: s_cbranch_scc0 .LBB0_10 +; CHECK-NEXT: ; %bb.18: ; %loop-memcpy-residual4.preheader ; CHECK-NEXT: ; in Loop: Header=BB0_12 Depth=1 ; CHECK-NEXT: s_mov_b64 s[12:13], 0 ; CHECK-NEXT: s_mov_b64 s[14:15], 0 -; CHECK-NEXT: .LBB0_18: ; %loop-memcpy-residual4 +; CHECK-NEXT: .LBB0_19: ; %loop-memcpy-residual4 ; CHECK-NEXT: ; Parent Loop BB0_12 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v12, s15 @@ -198,15 +209,17 @@ define void @issue63986(i64 %0, i64 %idxprom) { ; CHECK-NEXT: v_cmp_ge_u64_e64 s[8:9], s[14:15], v[6:7] ; CHECK-NEXT: v_addc_co_u32_e32 v11, vcc, v9, v12, vcc ; CHECK-NEXT: s_or_b64 s[12:13], s[8:9], s[12:13] +; CHECK-NEXT: s_andn2_b64 s[8:9], exec, s[12:13] +; CHECK-NEXT: s_and_b64 s[16:17], s[8:9], -1 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: flat_store_byte v[10:11], v13 -; CHECK-NEXT: s_andn2_b64 exec, exec, s[12:13] -; CHECK-NEXT: s_cbranch_execnz .LBB0_18 -; CHECK-NEXT: ; %bb.19: ; %Flow +; CHECK-NEXT: s_cselect_b64 exec, s[8:9], s[12:13] +; CHECK-NEXT: s_cbranch_scc1 .LBB0_19 +; CHECK-NEXT: ; %bb.20: ; %Flow ; CHECK-NEXT: ; in Loop: Header=BB0_12 Depth=1 -; CHECK-NEXT: s_or_b64 exec, exec, s[12:13] +; CHECK-NEXT: s_or_b64 exec, exec, s[10:11] ; CHECK-NEXT: s_branch .LBB0_10 -; CHECK-NEXT: .LBB0_20: ; %DummyReturnBlock +; CHECK-NEXT: .LBB0_21: ; %DummyReturnBlock ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/mixed-wave32-wave64.ll b/llvm/test/CodeGen/AMDGPU/mixed-wave32-wave64.ll index fe4c2e4b488b8..57b12e4305b4b 100644 --- a/llvm/test/CodeGen/AMDGPU/mixed-wave32-wave64.ll +++ b/llvm/test/CodeGen/AMDGPU/mixed-wave32-wave64.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck --check-prefix=GCN %s ; GCN-LABEL: _amdgpu_hs_main: diff --git a/llvm/test/CodeGen/AMDGPU/mmra.ll b/llvm/test/CodeGen/AMDGPU/mmra.ll index d9b48f79739b6..833a194c998f2 100644 --- a/llvm/test/CodeGen/AMDGPU/mmra.ll +++ b/llvm/test/CodeGen/AMDGPU/mmra.ll @@ -92,8 +92,6 @@ define void @atomicrmw_rel(ptr %ptr) { ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2.atomicrmw.end: - ; CHECK-NEXT: [[PHI2:%[0-9]+]]:sreg_64 = PHI [[SI_IF_BREAK]], %bb.1 - ; CHECK-NEXT: SI_END_CF [[PHI2]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: SI_RETURN %old.2 = atomicrmw add ptr %ptr, i8 0 release, !mmra !1 ret void @@ -160,22 +158,20 @@ define void @cmpxchg(ptr %ptr) { ; CHECK-NEXT: [[S_ANDN2_B64_:%[0-9]+]]:sreg_64 = S_ANDN2_B64 [[S_OR_B64_]], $exec, implicit-def $scc ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_]], $exec, implicit-def $scc ; CHECK-NEXT: [[S_OR_B64_1:%[0-9]+]]:sreg_64 = S_OR_B64 [[S_ANDN2_B64_]], [[S_AND_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3.Flow: ; CHECK-NEXT: successors: %bb.4(0x04000000), %bb.1(0x7c000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[PHI3:%[0-9]+]]:sreg_64 = PHI [[S_OR_B64_]], %bb.1, [[S_OR_B64_1]], %bb.2 ; CHECK-NEXT: [[PHI4:%[0-9]+]]:vgpr_32 = PHI [[COPY7]], %bb.1, [[V_AND_B32_e64_3]], %bb.2 - ; CHECK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[PHI3]] ; CHECK-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_64 = SI_IF_BREAK [[COPY8]], [[PHI1]], implicit-def dead $scc ; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.4 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4.partword.cmpxchg.end: - ; CHECK-NEXT: [[PHI5:%[0-9]+]]:sreg_64 = PHI [[SI_IF_BREAK]], %bb.3 - ; CHECK-NEXT: [[PHI6:%[0-9]+]]:vgpr_32 = PHI [[FLAT_ATOMIC_CMPSWAP_RTN]], %bb.3 - ; CHECK-NEXT: SI_END_CF [[PHI5]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[FLAT_ATOMIC_CMPSWAP_RTN]], %bb.3 ; CHECK-NEXT: SI_RETURN %pair = cmpxchg ptr %ptr, i8 0, i8 1 acquire acquire, !mmra !2 ret void diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll index 4332d9daeaaf5..25f9a57387294 100644 --- a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll @@ -23,34 +23,35 @@ define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, ptr addrspace(1 ; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[8:11], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 -; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GCN-NEXT: s_cbranch_execz .LBB0_4 +; GCN-NEXT: s_and_b64 s[2:3], vcc, -1 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB0_4 ; GCN-NEXT: ; %bb.1: ; %atomic ; GCN-NEXT: s_mov_b32 s8, s10 ; GCN-NEXT: s_mov_b32 s9, s10 -; GCN-NEXT: buffer_load_dword v4, v[1:2], s[8:11], 0 addr64 offset:400 +; GCN-NEXT: buffer_load_dword v5, v[1:2], s[8:11], 0 addr64 offset:400 ; GCN-NEXT: s_load_dword s2, s[0:1], 0xf ; GCN-NEXT: s_mov_b64 s[0:1], 0 ; GCN-NEXT: .LBB0_2: ; %atomicrmw.start ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: v_max_i32_e32 v3, s2, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v6, v4 -; GCN-NEXT: v_mov_b32_e32 v5, v3 -; GCN-NEXT: buffer_atomic_cmpswap v[5:6], v[1:2], s[8:11], 0 addr64 offset:400 glc +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_max_i32_e32 v4, s2, v5 +; GCN-NEXT: v_mov_b32_e32 v3, v4 +; GCN-NEXT: v_mov_b32_e32 v4, v5 +; GCN-NEXT: buffer_atomic_cmpswap v[3:4], v[1:2], s[8:11], 0 addr64 offset:400 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_wbinvl1 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GCN-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v4, v5 -; GCN-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN-NEXT: s_cbranch_execnz .LBB0_2 +; GCN-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GCN-NEXT: s_and_b64 s[12:13], s[6:7], -1 +; GCN-NEXT: v_mov_b32_e32 v5, v3 +; GCN-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GCN-NEXT: s_cbranch_scc1 .LBB0_2 ; GCN-NEXT: ; %bb.3: ; %atomicrmw.end -; GCN-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: buffer_store_dword v5, off, s[4:7], 0 +; GCN-NEXT: buffer_store_dword v3, off, s[4:7], 0 ; GCN-NEXT: .LBB0_4: ; %exit ; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -82,8 +83,9 @@ define amdgpu_kernel void @atomic_max_i32_noret(ptr addrspace(1) %out, ptr addrs ; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[4:7], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 -; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GCN-NEXT: s_cbranch_execz .LBB1_3 +; GCN-NEXT: s_and_b64 s[2:3], vcc, -1 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB1_3 ; GCN-NEXT: ; %bb.1: ; %atomic ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 @@ -102,9 +104,11 @@ define amdgpu_kernel void @atomic_max_i32_noret(ptr addrspace(1) %out, ptr addrs ; GCN-NEXT: buffer_wbinvl1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GCN-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN-NEXT: s_andn2_b64 s[8:9], exec, s[0:1] +; GCN-NEXT: s_and_b64 s[10:11], s[8:9], -1 ; GCN-NEXT: v_mov_b32_e32 v4, v5 -; GCN-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN-NEXT: s_cbranch_execnz .LBB1_2 +; GCN-NEXT: s_cselect_b64 exec, s[8:9], s[0:1] +; GCN-NEXT: s_cbranch_scc1 .LBB1_2 ; GCN-NEXT: .LBB1_3: ; %exit ; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll index 63688ebeab9d0..59ae79bf326e5 100644 --- a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll @@ -23,8 +23,9 @@ define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, ptr addrspace(1 ; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[8:11], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 -; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GCN-NEXT: s_cbranch_execz .LBB0_2 +; GCN-NEXT: s_and_b64 s[2:3], vcc, -1 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB0_2 ; GCN-NEXT: ; %bb.1: ; %atomic ; GCN-NEXT: s_load_dword s0, s[0:1], 0xf ; GCN-NEXT: s_mov_b32 s8, s10 @@ -67,8 +68,9 @@ define amdgpu_kernel void @atomic_max_i32_noret(ptr addrspace(1) %out, ptr addrs ; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[4:7], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 -; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GCN-NEXT: s_cbranch_execz .LBB1_2 +; GCN-NEXT: s_and_b64 s[2:3], vcc, -1 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB1_2 ; GCN-NEXT: ; %bb.1: ; %atomic ; GCN-NEXT: s_load_dword s0, s[0:1], 0xf ; GCN-NEXT: s_mov_b32 s4, s6 diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-vimage-vsample.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-vimage-vsample.ll index 003c3ea7fce10..1981f05a0bf7f 100644 --- a/llvm/test/CodeGen/AMDGPU/move-to-valu-vimage-vsample.ll +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-vimage-vsample.ll @@ -60,11 +60,10 @@ define amdgpu_ps float @vimage_move_to_valu(<8 x i32> %rsrc) { ; GFX11-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[IMAGE_LOAD_V1_V2_gfx11_:%[0-9]+]]:vgpr_32 = IMAGE_LOAD_V1_V2_gfx11 [[REG_SEQUENCE1]], killed [[REG_SEQUENCE6]], 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) - ; GFX11-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX11-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX11-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX11-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.1, implicit $exec ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: bb.3: - ; GFX11-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_1]] ; GFX11-NEXT: $vgpr0 = COPY [[IMAGE_LOAD_V1_V2_gfx11_]] ; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; @@ -122,11 +121,10 @@ define amdgpu_ps float @vimage_move_to_valu(<8 x i32> %rsrc) { ; GFX12-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[IMAGE_LOAD_V1_V2_gfx12_:%[0-9]+]]:vgpr_32 = IMAGE_LOAD_V1_V2_gfx12 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_]], killed [[REG_SEQUENCE5]], 1, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.1, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.3: - ; GFX12-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_]] ; GFX12-NEXT: $vgpr0 = COPY [[IMAGE_LOAD_V1_V2_gfx12_]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0 bb: @@ -194,11 +192,10 @@ define amdgpu_ps float @vsample_move_to_valu_rsrc(<8 x i32> %rsrc, <4 x i32> inr ; GFX11-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[IMAGE_SAMPLE_V1_V1_gfx11_:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_V1_V1_gfx11 [[V_MOV_B32_e32_]], killed [[REG_SEQUENCE6]], [[REG_SEQUENCE1]], 1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) - ; GFX11-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX11-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX11-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX11-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.1, implicit $exec ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: bb.3: - ; GFX11-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_]] ; GFX11-NEXT: $vgpr0 = COPY [[IMAGE_SAMPLE_V1_V1_gfx11_]] ; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; @@ -261,11 +258,10 @@ define amdgpu_ps float @vsample_move_to_valu_rsrc(<8 x i32> %rsrc, <4 x i32> inr ; GFX12-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[IMAGE_SAMPLE_V1_V1_gfx12_:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_V1_V1_gfx12 [[V_MOV_B32_e32_]], killed [[REG_SEQUENCE6]], [[REG_SEQUENCE1]], 1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.1, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.3: - ; GFX12-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_]] ; GFX12-NEXT: $vgpr0 = COPY [[IMAGE_SAMPLE_V1_V1_gfx12_]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0 main_body: @@ -319,11 +315,10 @@ define amdgpu_ps float @vsample_move_to_valu_samp(<8 x i32> inreg %rsrc, <4 x i3 ; GFX11-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[IMAGE_SAMPLE_V1_V1_gfx11_:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_V1_V1_gfx11 [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], killed [[REG_SEQUENCE4]], 1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) - ; GFX11-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX11-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX11-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX11-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.1, implicit $exec ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: bb.3: - ; GFX11-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_]] ; GFX11-NEXT: $vgpr0 = COPY [[IMAGE_SAMPLE_V1_V1_gfx11_]] ; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; @@ -372,11 +367,10 @@ define amdgpu_ps float @vsample_move_to_valu_samp(<8 x i32> inreg %rsrc, <4 x i3 ; GFX12-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[IMAGE_SAMPLE_V1_V1_gfx12_:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_V1_V1_gfx12 [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], killed [[REG_SEQUENCE4]], 1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) - ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; GFX12-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.1, implicit $exec ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: bb.3: - ; GFX12-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_]] ; GFX12-NEXT: $vgpr0 = COPY [[IMAGE_SAMPLE_V1_V1_gfx12_]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0 main_body: diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll index b5ee6689f8dc3..a4b5c3961082a 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll @@ -13,7 +13,6 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 { ; GFX9_W64-LABEL: mubuf_vgpr: ; GFX9_W64: ; %bb.0: ; GFX9_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9_W64-NEXT: s_mov_b64 s[6:7], exec ; GFX9_W64-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 ; GFX9_W64-NEXT: v_readfirstlane_b32 s8, v0 ; GFX9_W64-NEXT: v_readfirstlane_b32 s9, v1 @@ -25,12 +24,13 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 { ; GFX9_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX9_W64-NEXT: s_nop 0 ; GFX9_W64-NEXT: buffer_load_format_x v5, v4, s[8:11], 0 idxen +; GFX9_W64-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; GFX9_W64-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX9_W64-NEXT: ; implicit-def: $vgpr4 -; GFX9_W64-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX9_W64-NEXT: s_cbranch_execnz .LBB0_1 +; GFX9_W64-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9_W64-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX9_W64-NEXT: ; %bb.2: -; GFX9_W64-NEXT: s_mov_b64 exec, s[6:7] ; GFX9_W64-NEXT: s_waitcnt vmcnt(0) ; GFX9_W64-NEXT: v_mov_b32_e32 v0, v5 ; GFX9_W64-NEXT: s_setpc_b64 s[30:31] @@ -38,7 +38,6 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 { ; GFX1010_W32-LABEL: mubuf_vgpr: ; GFX1010_W32: ; %bb.0: ; GFX1010_W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1010_W32-NEXT: s_mov_b32 s5, exec_lo ; GFX1010_W32-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 ; GFX1010_W32-NEXT: v_readfirstlane_b32 s8, v0 ; GFX1010_W32-NEXT: v_readfirstlane_b32 s9, v1 @@ -49,13 +48,14 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 { ; GFX1010_W32-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX1010_W32-NEXT: s_and_saveexec_b32 s4, s4 ; GFX1010_W32-NEXT: buffer_load_format_x v5, v4, s[8:11], 0 idxen +; GFX1010_W32-NEXT: s_xor_b32 s5, exec_lo, s4 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr4 +; GFX1010_W32-NEXT: s_and_b32 s6, s5, -1 ; GFX1010_W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX1010_W32-NEXT: s_cbranch_execnz .LBB0_1 +; GFX1010_W32-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX1010_W32-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX1010_W32-NEXT: ; %bb.2: -; GFX1010_W32-NEXT: s_mov_b32 exec_lo, s5 ; GFX1010_W32-NEXT: s_waitcnt vmcnt(0) ; GFX1010_W32-NEXT: v_mov_b32_e32 v0, v5 ; GFX1010_W32-NEXT: s_setpc_b64 s[30:31] @@ -63,7 +63,6 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 { ; GFX1010_W64-LABEL: mubuf_vgpr: ; GFX1010_W64: ; %bb.0: ; GFX1010_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1010_W64-NEXT: s_mov_b64 s[6:7], exec ; GFX1010_W64-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 ; GFX1010_W64-NEXT: v_readfirstlane_b32 s8, v0 ; GFX1010_W64-NEXT: v_readfirstlane_b32 s9, v1 @@ -74,13 +73,14 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 { ; GFX1010_W64-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX1010_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX1010_W64-NEXT: buffer_load_format_x v5, v4, s[8:11], 0 idxen +; GFX1010_W64-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; GFX1010_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX1010_W64-NEXT: ; implicit-def: $vgpr4 ; GFX1010_W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010_W64-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX1010_W64-NEXT: s_cbranch_execnz .LBB0_1 +; GFX1010_W64-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX1010_W64-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX1010_W64-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX1010_W64-NEXT: ; %bb.2: -; GFX1010_W64-NEXT: s_mov_b64 exec, s[6:7] ; GFX1010_W64-NEXT: s_waitcnt vmcnt(0) ; GFX1010_W64-NEXT: v_mov_b32_e32 v0, v5 ; GFX1010_W64-NEXT: s_setpc_b64 s[30:31] @@ -88,7 +88,6 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 { ; GFX1100_W32-LABEL: mubuf_vgpr: ; GFX1100_W32: ; %bb.0: ; GFX1100_W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100_W32-NEXT: s_mov_b32 s1, exec_lo ; GFX1100_W32-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 ; GFX1100_W32-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1100_W32-NEXT: v_readfirstlane_b32 s5, v1 @@ -101,12 +100,14 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 { ; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1100_W32-NEXT: buffer_load_format_x v5, v4, s[4:7], 0 idxen +; GFX1100_W32-NEXT: s_xor_b32 s1, exec_lo, s0 ; GFX1100_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX1100_W32-NEXT: ; implicit-def: $vgpr4 -; GFX1100_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1100_W32-NEXT: s_cbranch_execnz .LBB0_1 +; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1100_W32-NEXT: s_and_b32 s2, s1, -1 +; GFX1100_W32-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1100_W32-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX1100_W32-NEXT: ; %bb.2: -; GFX1100_W32-NEXT: s_mov_b32 exec_lo, s1 ; GFX1100_W32-NEXT: s_waitcnt vmcnt(0) ; GFX1100_W32-NEXT: v_mov_b32_e32 v0, v5 ; GFX1100_W32-NEXT: s_setpc_b64 s[30:31] @@ -114,7 +115,6 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 { ; GFX1100_W64-LABEL: mubuf_vgpr: ; GFX1100_W64: ; %bb.0: ; GFX1100_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100_W64-NEXT: s_mov_b64 s[2:3], exec ; GFX1100_W64-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 ; GFX1100_W64-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1100_W64-NEXT: v_readfirstlane_b32 s5, v1 @@ -127,12 +127,14 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 { ; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX1100_W64-NEXT: buffer_load_format_x v5, v4, s[4:7], 0 idxen +; GFX1100_W64-NEXT: s_xor_b64 s[2:3], exec, s[0:1] ; GFX1100_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX1100_W64-NEXT: ; implicit-def: $vgpr4 -; GFX1100_W64-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1100_W64-NEXT: s_cbranch_execnz .LBB0_1 +; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1100_W64-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX1100_W64-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX1100_W64-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX1100_W64-NEXT: ; %bb.2: -; GFX1100_W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX1100_W64-NEXT: s_waitcnt vmcnt(0) ; GFX1100_W64-NEXT: v_mov_b32_e32 v0, v5 ; GFX1100_W64-NEXT: s_setpc_b64 s[30:31] @@ -169,12 +171,10 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 { ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 ; W64-O0-NEXT: s_mov_b32 s4, 0 ; W64-O0-NEXT: v_writelane_b32 v0, s4, 0 -; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v0, s4, 1 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 2 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: s_mov_b64 s[4:5], exec ; W64-O0-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload @@ -199,13 +199,13 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 { ; W64-O0-NEXT: s_mov_b32 s9, s12 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_writelane_b32 v0, s8, 3 -; W64-O0-NEXT: v_writelane_b32 v0, s9, 4 -; W64-O0-NEXT: v_writelane_b32 v0, s10, 5 -; W64-O0-NEXT: v_writelane_b32 v0, s11, 6 +; W64-O0-NEXT: v_writelane_b32 v0, s8, 1 +; W64-O0-NEXT: v_writelane_b32 v0, s9, 2 +; W64-O0-NEXT: v_writelane_b32 v0, s10, 3 +; W64-O0-NEXT: v_writelane_b32 v0, s11, 4 ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; W64-O0-NEXT: v_writelane_b32 v0, s4, 7 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 8 +; W64-O0-NEXT: v_writelane_b32 v0, s4, 5 +; W64-O0-NEXT: v_writelane_b32 v0, s5, 6 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] @@ -214,29 +214,27 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 { ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v1, 7 -; W64-O0-NEXT: v_readlane_b32 s5, v1, 8 -; W64-O0-NEXT: v_readlane_b32 s8, v1, 3 -; W64-O0-NEXT: v_readlane_b32 s9, v1, 4 -; W64-O0-NEXT: v_readlane_b32 s10, v1, 5 -; W64-O0-NEXT: v_readlane_b32 s11, v1, 6 -; W64-O0-NEXT: v_readlane_b32 s6, v1, 0 +; W64-O0-NEXT: v_readlane_b32 s6, v1, 5 +; W64-O0-NEXT: v_readlane_b32 s7, v1, 6 +; W64-O0-NEXT: v_readlane_b32 s8, v1, 1 +; W64-O0-NEXT: v_readlane_b32 s9, v1, 2 +; W64-O0-NEXT: v_readlane_b32 s10, v1, 3 +; W64-O0-NEXT: v_readlane_b32 s11, v1, 4 +; W64-O0-NEXT: v_readlane_b32 s4, v1, 0 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_nop 2 -; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen +; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s4 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] -; W64-O0-NEXT: s_cbranch_execnz .LBB0_1 +; W64-O0-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; W64-O0-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; W64-O0-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; W64-O0-NEXT: s_cbranch_scc1 .LBB0_1 ; W64-O0-NEXT: ; %bb.3: ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v1, 1 -; W64-O0-NEXT: v_readlane_b32 s5, v1, 2 -; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; W64-O0-NEXT: ; kill: killed $vgpr1 ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 @@ -259,7 +257,6 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; GFX9_W64-LABEL: mubuf_vgpr_adjacent_in_block: ; GFX9_W64: ; %bb.0: ; %entry ; GFX9_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9_W64-NEXT: s_mov_b64 s[6:7], exec ; GFX9_W64-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 ; GFX9_W64-NEXT: v_readfirstlane_b32 s8, v0 ; GFX9_W64-NEXT: v_readfirstlane_b32 s9, v1 @@ -271,13 +268,12 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; GFX9_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX9_W64-NEXT: s_nop 0 ; GFX9_W64-NEXT: buffer_load_format_x v13, v8, s[8:11], 0 idxen +; GFX9_W64-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; GFX9_W64-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX9_W64-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX9_W64-NEXT: s_cbranch_execnz .LBB1_1 -; GFX9_W64-NEXT: ; %bb.2: -; GFX9_W64-NEXT: s_mov_b64 exec, s[6:7] -; GFX9_W64-NEXT: s_mov_b64 s[6:7], exec -; GFX9_W64-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1 +; GFX9_W64-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9_W64-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX9_W64-NEXT: .LBB1_2: ; =>This Inner Loop Header: Depth=1 ; GFX9_W64-NEXT: v_readfirstlane_b32 s8, v4 ; GFX9_W64-NEXT: v_readfirstlane_b32 s9, v5 ; GFX9_W64-NEXT: v_readfirstlane_b32 s10, v6 @@ -288,12 +284,13 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; GFX9_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX9_W64-NEXT: s_nop 0 ; GFX9_W64-NEXT: buffer_load_format_x v0, v8, s[8:11], 0 idxen +; GFX9_W64-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; GFX9_W64-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX9_W64-NEXT: ; implicit-def: $vgpr8 -; GFX9_W64-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX9_W64-NEXT: s_cbranch_execnz .LBB1_3 -; GFX9_W64-NEXT: ; %bb.4: -; GFX9_W64-NEXT: s_mov_b64 exec, s[6:7] +; GFX9_W64-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9_W64-NEXT: s_cbranch_scc1 .LBB1_2 +; GFX9_W64-NEXT: ; %bb.3: ; GFX9_W64-NEXT: s_waitcnt vmcnt(1) ; GFX9_W64-NEXT: global_store_dword v[9:10], v13, off ; GFX9_W64-NEXT: s_waitcnt vmcnt(0) @@ -304,7 +301,6 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; GFX1010_W32-LABEL: mubuf_vgpr_adjacent_in_block: ; GFX1010_W32: ; %bb.0: ; %entry ; GFX1010_W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1010_W32-NEXT: s_mov_b32 s5, exec_lo ; GFX1010_W32-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 ; GFX1010_W32-NEXT: v_readfirstlane_b32 s8, v0 ; GFX1010_W32-NEXT: v_readfirstlane_b32 s9, v1 @@ -315,14 +311,13 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; GFX1010_W32-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX1010_W32-NEXT: s_and_saveexec_b32 s4, s4 ; GFX1010_W32-NEXT: buffer_load_format_x v13, v8, s[8:11], 0 idxen +; GFX1010_W32-NEXT: s_xor_b32 s5, exec_lo, s4 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX1010_W32-NEXT: s_and_b32 s6, s5, -1 ; GFX1010_W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX1010_W32-NEXT: s_cbranch_execnz .LBB1_1 -; GFX1010_W32-NEXT: ; %bb.2: -; GFX1010_W32-NEXT: s_mov_b32 exec_lo, s5 -; GFX1010_W32-NEXT: s_mov_b32 s5, exec_lo -; GFX1010_W32-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1 +; GFX1010_W32-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX1010_W32-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1010_W32-NEXT: .LBB1_2: ; =>This Inner Loop Header: Depth=1 ; GFX1010_W32-NEXT: v_readfirstlane_b32 s8, v4 ; GFX1010_W32-NEXT: v_readfirstlane_b32 s9, v5 ; GFX1010_W32-NEXT: v_readfirstlane_b32 s10, v6 @@ -332,13 +327,14 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; GFX1010_W32-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX1010_W32-NEXT: s_and_saveexec_b32 s4, s4 ; GFX1010_W32-NEXT: buffer_load_format_x v0, v8, s[8:11], 0 idxen +; GFX1010_W32-NEXT: s_xor_b32 s5, exec_lo, s4 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr8 +; GFX1010_W32-NEXT: s_and_b32 s6, s5, -1 ; GFX1010_W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX1010_W32-NEXT: s_cbranch_execnz .LBB1_3 -; GFX1010_W32-NEXT: ; %bb.4: -; GFX1010_W32-NEXT: s_mov_b32 exec_lo, s5 +; GFX1010_W32-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX1010_W32-NEXT: s_cbranch_scc1 .LBB1_2 +; GFX1010_W32-NEXT: ; %bb.3: ; GFX1010_W32-NEXT: s_waitcnt vmcnt(1) ; GFX1010_W32-NEXT: global_store_dword v[9:10], v13, off ; GFX1010_W32-NEXT: s_waitcnt_vscnt null, 0x0 @@ -350,7 +346,6 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; GFX1010_W64-LABEL: mubuf_vgpr_adjacent_in_block: ; GFX1010_W64: ; %bb.0: ; %entry ; GFX1010_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1010_W64-NEXT: s_mov_b64 s[6:7], exec ; GFX1010_W64-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 ; GFX1010_W64-NEXT: v_readfirstlane_b32 s8, v0 ; GFX1010_W64-NEXT: v_readfirstlane_b32 s9, v1 @@ -361,14 +356,13 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; GFX1010_W64-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX1010_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX1010_W64-NEXT: buffer_load_format_x v13, v8, s[8:11], 0 idxen +; GFX1010_W64-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; GFX1010_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX1010_W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010_W64-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX1010_W64-NEXT: s_cbranch_execnz .LBB1_1 -; GFX1010_W64-NEXT: ; %bb.2: -; GFX1010_W64-NEXT: s_mov_b64 exec, s[6:7] -; GFX1010_W64-NEXT: s_mov_b64 s[6:7], exec -; GFX1010_W64-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1 +; GFX1010_W64-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX1010_W64-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX1010_W64-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1010_W64-NEXT: .LBB1_2: ; =>This Inner Loop Header: Depth=1 ; GFX1010_W64-NEXT: v_readfirstlane_b32 s8, v4 ; GFX1010_W64-NEXT: v_readfirstlane_b32 s9, v5 ; GFX1010_W64-NEXT: v_readfirstlane_b32 s10, v6 @@ -378,13 +372,14 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; GFX1010_W64-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX1010_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX1010_W64-NEXT: buffer_load_format_x v0, v8, s[8:11], 0 idxen +; GFX1010_W64-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; GFX1010_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1010_W64-NEXT: ; implicit-def: $vgpr8 ; GFX1010_W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010_W64-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX1010_W64-NEXT: s_cbranch_execnz .LBB1_3 -; GFX1010_W64-NEXT: ; %bb.4: -; GFX1010_W64-NEXT: s_mov_b64 exec, s[6:7] +; GFX1010_W64-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX1010_W64-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX1010_W64-NEXT: s_cbranch_scc1 .LBB1_2 +; GFX1010_W64-NEXT: ; %bb.3: ; GFX1010_W64-NEXT: s_waitcnt vmcnt(1) ; GFX1010_W64-NEXT: global_store_dword v[9:10], v13, off ; GFX1010_W64-NEXT: s_waitcnt_vscnt null, 0x0 @@ -396,7 +391,6 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; GFX1100_W32-LABEL: mubuf_vgpr_adjacent_in_block: ; GFX1100_W32: ; %bb.0: ; %entry ; GFX1100_W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100_W32-NEXT: s_mov_b32 s1, exec_lo ; GFX1100_W32-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 ; GFX1100_W32-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1100_W32-NEXT: v_readfirstlane_b32 s5, v1 @@ -409,14 +403,13 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1100_W32-NEXT: buffer_load_format_x v13, v8, s[4:7], 0 idxen +; GFX1100_W32-NEXT: s_xor_b32 s1, exec_lo, s0 ; GFX1100_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX1100_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1100_W32-NEXT: s_cbranch_execnz .LBB1_1 -; GFX1100_W32-NEXT: ; %bb.2: -; GFX1100_W32-NEXT: s_mov_b32 exec_lo, s1 ; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1100_W32-NEXT: s_mov_b32 s1, exec_lo -; GFX1100_W32-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1 +; GFX1100_W32-NEXT: s_and_b32 s2, s1, -1 +; GFX1100_W32-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1100_W32-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1100_W32-NEXT: .LBB1_2: ; =>This Inner Loop Header: Depth=1 ; GFX1100_W32-NEXT: v_readfirstlane_b32 s4, v4 ; GFX1100_W32-NEXT: v_readfirstlane_b32 s5, v5 ; GFX1100_W32-NEXT: v_readfirstlane_b32 s6, v6 @@ -428,12 +421,14 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1100_W32-NEXT: buffer_load_format_x v0, v8, s[4:7], 0 idxen +; GFX1100_W32-NEXT: s_xor_b32 s1, exec_lo, s0 ; GFX1100_W32-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1100_W32-NEXT: ; implicit-def: $vgpr8 -; GFX1100_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1100_W32-NEXT: s_cbranch_execnz .LBB1_3 -; GFX1100_W32-NEXT: ; %bb.4: -; GFX1100_W32-NEXT: s_mov_b32 exec_lo, s1 +; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1100_W32-NEXT: s_and_b32 s2, s1, -1 +; GFX1100_W32-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1100_W32-NEXT: s_cbranch_scc1 .LBB1_2 +; GFX1100_W32-NEXT: ; %bb.3: ; GFX1100_W32-NEXT: s_waitcnt vmcnt(1) ; GFX1100_W32-NEXT: global_store_b32 v[9:10], v13, off dlc ; GFX1100_W32-NEXT: s_waitcnt_vscnt null, 0x0 @@ -445,7 +440,6 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; GFX1100_W64-LABEL: mubuf_vgpr_adjacent_in_block: ; GFX1100_W64: ; %bb.0: ; %entry ; GFX1100_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100_W64-NEXT: s_mov_b64 s[2:3], exec ; GFX1100_W64-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 ; GFX1100_W64-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1100_W64-NEXT: v_readfirstlane_b32 s5, v1 @@ -458,14 +452,13 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX1100_W64-NEXT: buffer_load_format_x v13, v8, s[4:7], 0 idxen +; GFX1100_W64-NEXT: s_xor_b64 s[2:3], exec, s[0:1] ; GFX1100_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX1100_W64-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1100_W64-NEXT: s_cbranch_execnz .LBB1_1 -; GFX1100_W64-NEXT: ; %bb.2: -; GFX1100_W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1100_W64-NEXT: s_mov_b64 s[2:3], exec -; GFX1100_W64-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1 +; GFX1100_W64-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX1100_W64-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX1100_W64-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1100_W64-NEXT: .LBB1_2: ; =>This Inner Loop Header: Depth=1 ; GFX1100_W64-NEXT: v_readfirstlane_b32 s4, v4 ; GFX1100_W64-NEXT: v_readfirstlane_b32 s5, v5 ; GFX1100_W64-NEXT: v_readfirstlane_b32 s6, v6 @@ -477,12 +470,14 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX1100_W64-NEXT: buffer_load_format_x v0, v8, s[4:7], 0 idxen +; GFX1100_W64-NEXT: s_xor_b64 s[2:3], exec, s[0:1] ; GFX1100_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1100_W64-NEXT: ; implicit-def: $vgpr8 -; GFX1100_W64-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1100_W64-NEXT: s_cbranch_execnz .LBB1_3 -; GFX1100_W64-NEXT: ; %bb.4: -; GFX1100_W64-NEXT: s_mov_b64 exec, s[2:3] +; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1100_W64-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX1100_W64-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX1100_W64-NEXT: s_cbranch_scc1 .LBB1_2 +; GFX1100_W64-NEXT: ; %bb.3: ; GFX1100_W64-NEXT: s_waitcnt vmcnt(1) ; GFX1100_W64-NEXT: global_store_b32 v[9:10], v13, off dlc ; GFX1100_W64-NEXT: s_waitcnt_vscnt null, 0x0 @@ -561,12 +556,10 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 ; W64-O0-NEXT: s_mov_b32 s4, 0 ; W64-O0-NEXT: v_writelane_b32 v0, s4, 0 -; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v0, s4, 1 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 2 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: s_mov_b64 s[4:5], exec ; W64-O0-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload @@ -591,13 +584,13 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; W64-O0-NEXT: s_mov_b32 s9, s12 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_writelane_b32 v0, s8, 3 -; W64-O0-NEXT: v_writelane_b32 v0, s9, 4 -; W64-O0-NEXT: v_writelane_b32 v0, s10, 5 -; W64-O0-NEXT: v_writelane_b32 v0, s11, 6 +; W64-O0-NEXT: v_writelane_b32 v0, s8, 1 +; W64-O0-NEXT: v_writelane_b32 v0, s9, 2 +; W64-O0-NEXT: v_writelane_b32 v0, s10, 3 +; W64-O0-NEXT: v_writelane_b32 v0, s11, 4 ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; W64-O0-NEXT: v_writelane_b32 v0, s4, 7 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 8 +; W64-O0-NEXT: v_writelane_b32 v0, s4, 5 +; W64-O0-NEXT: v_writelane_b32 v0, s5, 6 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] @@ -606,35 +599,25 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v1, 7 -; W64-O0-NEXT: v_readlane_b32 s5, v1, 8 -; W64-O0-NEXT: v_readlane_b32 s8, v1, 3 -; W64-O0-NEXT: v_readlane_b32 s9, v1, 4 -; W64-O0-NEXT: v_readlane_b32 s10, v1, 5 -; W64-O0-NEXT: v_readlane_b32 s11, v1, 6 -; W64-O0-NEXT: v_readlane_b32 s6, v1, 0 +; W64-O0-NEXT: v_readlane_b32 s6, v1, 5 +; W64-O0-NEXT: v_readlane_b32 s7, v1, 6 +; W64-O0-NEXT: v_readlane_b32 s8, v1, 1 +; W64-O0-NEXT: v_readlane_b32 s9, v1, 2 +; W64-O0-NEXT: v_readlane_b32 s10, v1, 3 +; W64-O0-NEXT: v_readlane_b32 s11, v1, 4 +; W64-O0-NEXT: v_readlane_b32 s4, v1, 0 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_nop 2 -; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen +; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s4 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] -; W64-O0-NEXT: s_cbranch_execnz .LBB1_1 +; W64-O0-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; W64-O0-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; W64-O0-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; W64-O0-NEXT: s_cbranch_scc1 .LBB1_1 ; W64-O0-NEXT: ; %bb.3: -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v0, 1 -; W64-O0-NEXT: v_readlane_b32 s5, v0, 2 -; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v0, s4, 9 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 10 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: .LBB1_4: ; =>This Inner Loop Header: Depth=1 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload @@ -659,13 +642,13 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; W64-O0-NEXT: s_mov_b32 s9, s12 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_writelane_b32 v0, s8, 11 -; W64-O0-NEXT: v_writelane_b32 v0, s9, 12 -; W64-O0-NEXT: v_writelane_b32 v0, s10, 13 -; W64-O0-NEXT: v_writelane_b32 v0, s11, 14 +; W64-O0-NEXT: v_writelane_b32 v0, s8, 7 +; W64-O0-NEXT: v_writelane_b32 v0, s9, 8 +; W64-O0-NEXT: v_writelane_b32 v0, s10, 9 +; W64-O0-NEXT: v_writelane_b32 v0, s11, 10 ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; W64-O0-NEXT: v_writelane_b32 v0, s4, 15 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 16 +; W64-O0-NEXT: v_writelane_b32 v0, s4, 11 +; W64-O0-NEXT: v_writelane_b32 v0, s5, 12 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] @@ -674,29 +657,27 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v1, 15 -; W64-O0-NEXT: v_readlane_b32 s5, v1, 16 -; W64-O0-NEXT: v_readlane_b32 s8, v1, 11 -; W64-O0-NEXT: v_readlane_b32 s9, v1, 12 -; W64-O0-NEXT: v_readlane_b32 s10, v1, 13 -; W64-O0-NEXT: v_readlane_b32 s11, v1, 14 -; W64-O0-NEXT: v_readlane_b32 s6, v1, 0 +; W64-O0-NEXT: v_readlane_b32 s6, v1, 11 +; W64-O0-NEXT: v_readlane_b32 s7, v1, 12 +; W64-O0-NEXT: v_readlane_b32 s8, v1, 7 +; W64-O0-NEXT: v_readlane_b32 s9, v1, 8 +; W64-O0-NEXT: v_readlane_b32 s10, v1, 9 +; W64-O0-NEXT: v_readlane_b32 s11, v1, 10 +; W64-O0-NEXT: v_readlane_b32 s4, v1, 0 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_nop 2 -; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen +; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s4 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] -; W64-O0-NEXT: s_cbranch_execnz .LBB1_4 +; W64-O0-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; W64-O0-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; W64-O0-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; W64-O0-NEXT: s_cbranch_scc1 .LBB1_4 ; W64-O0-NEXT: ; %bb.6: ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v0, 9 -; W64-O0-NEXT: v_readlane_b32 s5, v0, 10 -; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload @@ -733,8 +714,7 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; GFX9_W64-NEXT: ;;#ASMSTART ; GFX9_W64-NEXT: s_mov_b32 s4, 17 ; GFX9_W64-NEXT: ;;#ASMEND -; GFX9_W64-NEXT: v_mov_b32_e32 v8, s4 -; GFX9_W64-NEXT: s_mov_b64 s[12:13], exec +; GFX9_W64-NEXT: v_mov_b32_e32 v9, s4 ; GFX9_W64-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX9_W64-NEXT: v_readfirstlane_b32 s8, v0 ; GFX9_W64-NEXT: v_readfirstlane_b32 s9, v1 @@ -745,20 +725,22 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; GFX9_W64-NEXT: s_and_b64 s[6:7], vcc, s[6:7] ; GFX9_W64-NEXT: s_and_saveexec_b64 s[6:7], s[6:7] ; GFX9_W64-NEXT: s_nop 0 -; GFX9_W64-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen +; GFX9_W64-NEXT: buffer_load_format_x v8, v9, s[8:11], 0 idxen +; GFX9_W64-NEXT: s_xor_b64 s[8:9], exec, s[6:7] +; GFX9_W64-NEXT: s_and_b64 s[10:11], s[8:9], -1 ; GFX9_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX9_W64-NEXT: ; implicit-def: $vgpr8 -; GFX9_W64-NEXT: s_xor_b64 exec, exec, s[6:7] -; GFX9_W64-NEXT: s_cbranch_execnz .LBB2_1 +; GFX9_W64-NEXT: ; implicit-def: $vgpr9 +; GFX9_W64-NEXT: s_cselect_b64 exec, s[8:9], s[6:7] +; GFX9_W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9_W64-NEXT: ; %bb.2: -; GFX9_W64-NEXT: s_mov_b64 exec, s[12:13] ; GFX9_W64-NEXT: v_and_b32_e32 v0, 0x3ff, v31 ; GFX9_W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9_W64-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX9_W64-NEXT: s_cbranch_execz .LBB2_6 +; GFX9_W64-NEXT: s_mov_b64 s[6:7], exec +; GFX9_W64-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX9_W64-NEXT: s_cmov_b64 exec, vcc +; GFX9_W64-NEXT: s_cbranch_scc0 .LBB2_6 ; GFX9_W64-NEXT: ; %bb.3: ; %bb1 ; GFX9_W64-NEXT: v_mov_b32_e32 v0, s4 -; GFX9_W64-NEXT: s_mov_b64 s[12:13], exec ; GFX9_W64-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 ; GFX9_W64-NEXT: v_readfirstlane_b32 s8, v4 ; GFX9_W64-NEXT: v_readfirstlane_b32 s9, v5 @@ -769,17 +751,18 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; GFX9_W64-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX9_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX9_W64-NEXT: s_nop 0 -; GFX9_W64-NEXT: buffer_load_format_x v9, v0, s[8:11], 0 idxen +; GFX9_W64-NEXT: buffer_load_format_x v8, v0, s[8:11], 0 idxen +; GFX9_W64-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; GFX9_W64-NEXT: s_and_b64 s[10:11], s[8:9], -1 ; GFX9_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX9_W64-NEXT: ; implicit-def: $vgpr0 -; GFX9_W64-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX9_W64-NEXT: s_cbranch_execnz .LBB2_4 +; GFX9_W64-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX9_W64-NEXT: s_cbranch_scc1 .LBB2_4 ; GFX9_W64-NEXT: ; %bb.5: -; GFX9_W64-NEXT: s_mov_b64 exec, s[12:13] -; GFX9_W64-NEXT: .LBB2_6: ; %bb2 ; GFX9_W64-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9_W64-NEXT: .LBB2_6: ; %bb2 ; GFX9_W64-NEXT: s_waitcnt vmcnt(0) -; GFX9_W64-NEXT: global_store_dword v[11:12], v9, off +; GFX9_W64-NEXT: global_store_dword v[11:12], v8, off ; GFX9_W64-NEXT: s_waitcnt vmcnt(0) ; GFX9_W64-NEXT: s_setpc_b64 s[30:31] ; @@ -790,7 +773,6 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; GFX1010_W32-NEXT: s_mov_b32 s4, 17 ; GFX1010_W32-NEXT: ;;#ASMEND ; GFX1010_W32-NEXT: v_mov_b32_e32 v8, s4 -; GFX1010_W32-NEXT: s_mov_b32 s6, exec_lo ; GFX1010_W32-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX1010_W32-NEXT: v_readfirstlane_b32 s8, v0 ; GFX1010_W32-NEXT: v_readfirstlane_b32 s9, v1 @@ -801,20 +783,22 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; GFX1010_W32-NEXT: s_and_b32 s5, vcc_lo, s5 ; GFX1010_W32-NEXT: s_and_saveexec_b32 s5, s5 ; GFX1010_W32-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen +; GFX1010_W32-NEXT: s_xor_b32 s6, exec_lo, s5 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr8 +; GFX1010_W32-NEXT: s_and_b32 s7, s6, -1 ; GFX1010_W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s5 -; GFX1010_W32-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1010_W32-NEXT: s_cselect_b32 exec_lo, s6, s5 +; GFX1010_W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1010_W32-NEXT: ; %bb.2: -; GFX1010_W32-NEXT: s_mov_b32 exec_lo, s6 ; GFX1010_W32-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; GFX1010_W32-NEXT: s_mov_b32 s5, exec_lo ; GFX1010_W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1010_W32-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1010_W32-NEXT: s_cbranch_execz .LBB2_6 +; GFX1010_W32-NEXT: s_and_b32 s6, vcc_lo, -1 +; GFX1010_W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1010_W32-NEXT: s_cbranch_scc0 .LBB2_6 ; GFX1010_W32-NEXT: ; %bb.3: ; %bb1 ; GFX1010_W32-NEXT: v_mov_b32_e32 v0, s4 -; GFX1010_W32-NEXT: s_mov_b32 s6, exec_lo ; GFX1010_W32-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 ; GFX1010_W32-NEXT: v_readfirstlane_b32 s8, v4 ; GFX1010_W32-NEXT: v_readfirstlane_b32 s9, v5 @@ -825,15 +809,16 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; GFX1010_W32-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX1010_W32-NEXT: s_and_saveexec_b32 s4, s4 ; GFX1010_W32-NEXT: buffer_load_format_x v9, v0, s[8:11], 0 idxen +; GFX1010_W32-NEXT: s_xor_b32 s6, exec_lo, s4 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr0 +; GFX1010_W32-NEXT: s_and_b32 s7, s6, -1 ; GFX1010_W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX1010_W32-NEXT: s_cbranch_execnz .LBB2_4 +; GFX1010_W32-NEXT: s_cselect_b32 exec_lo, s6, s4 +; GFX1010_W32-NEXT: s_cbranch_scc1 .LBB2_4 ; GFX1010_W32-NEXT: ; %bb.5: -; GFX1010_W32-NEXT: s_mov_b32 exec_lo, s6 -; GFX1010_W32-NEXT: .LBB2_6: ; %bb2 ; GFX1010_W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1010_W32-NEXT: .LBB2_6: ; %bb2 ; GFX1010_W32-NEXT: s_waitcnt vmcnt(0) ; GFX1010_W32-NEXT: global_store_dword v[11:12], v9, off ; GFX1010_W32-NEXT: s_waitcnt_vscnt null, 0x0 @@ -846,7 +831,6 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; GFX1010_W64-NEXT: s_mov_b32 s4, 17 ; GFX1010_W64-NEXT: ;;#ASMEND ; GFX1010_W64-NEXT: v_mov_b32_e32 v8, s4 -; GFX1010_W64-NEXT: s_mov_b64 s[12:13], exec ; GFX1010_W64-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX1010_W64-NEXT: v_readfirstlane_b32 s8, v0 ; GFX1010_W64-NEXT: v_readfirstlane_b32 s9, v1 @@ -857,20 +841,22 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; GFX1010_W64-NEXT: s_and_b64 s[6:7], vcc, s[6:7] ; GFX1010_W64-NEXT: s_and_saveexec_b64 s[6:7], s[6:7] ; GFX1010_W64-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen +; GFX1010_W64-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1010_W64-NEXT: s_xor_b64 s[8:9], exec, s[6:7] ; GFX1010_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX1010_W64-NEXT: ; implicit-def: $vgpr8 -; GFX1010_W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010_W64-NEXT: s_xor_b64 exec, exec, s[6:7] -; GFX1010_W64-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1010_W64-NEXT: s_and_b64 s[10:11], s[8:9], -1 +; GFX1010_W64-NEXT: s_cselect_b64 exec, s[8:9], s[6:7] +; GFX1010_W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1010_W64-NEXT: ; %bb.2: -; GFX1010_W64-NEXT: s_mov_b64 exec, s[12:13] ; GFX1010_W64-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; GFX1010_W64-NEXT: s_mov_b64 s[6:7], exec ; GFX1010_W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1010_W64-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX1010_W64-NEXT: s_cbranch_execz .LBB2_6 +; GFX1010_W64-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX1010_W64-NEXT: s_cmov_b64 exec, vcc +; GFX1010_W64-NEXT: s_cbranch_scc0 .LBB2_6 ; GFX1010_W64-NEXT: ; %bb.3: ; %bb1 ; GFX1010_W64-NEXT: v_mov_b32_e32 v0, s4 -; GFX1010_W64-NEXT: s_mov_b64 s[12:13], exec ; GFX1010_W64-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 ; GFX1010_W64-NEXT: v_readfirstlane_b32 s8, v4 ; GFX1010_W64-NEXT: v_readfirstlane_b32 s9, v5 @@ -881,15 +867,16 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; GFX1010_W64-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX1010_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX1010_W64-NEXT: buffer_load_format_x v9, v0, s[8:11], 0 idxen +; GFX1010_W64-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1010_W64-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GFX1010_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1010_W64-NEXT: ; implicit-def: $vgpr0 -; GFX1010_W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010_W64-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX1010_W64-NEXT: s_cbranch_execnz .LBB2_4 +; GFX1010_W64-NEXT: s_and_b64 s[10:11], s[8:9], -1 +; GFX1010_W64-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX1010_W64-NEXT: s_cbranch_scc1 .LBB2_4 ; GFX1010_W64-NEXT: ; %bb.5: -; GFX1010_W64-NEXT: s_mov_b64 exec, s[12:13] -; GFX1010_W64-NEXT: .LBB2_6: ; %bb2 ; GFX1010_W64-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1010_W64-NEXT: .LBB2_6: ; %bb2 ; GFX1010_W64-NEXT: s_waitcnt vmcnt(0) ; GFX1010_W64-NEXT: global_store_dword v[11:12], v9, off ; GFX1010_W64-NEXT: s_waitcnt_vscnt null, 0x0 @@ -902,7 +889,6 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; GFX1100_W32-NEXT: s_mov_b32 s4, 17 ; GFX1100_W32-NEXT: ;;#ASMEND ; GFX1100_W32-NEXT: v_mov_b32_e32 v8, s4 -; GFX1100_W32-NEXT: s_mov_b32 s1, exec_lo ; GFX1100_W32-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX1100_W32-NEXT: v_readfirstlane_b32 s8, v0 ; GFX1100_W32-NEXT: v_readfirstlane_b32 s9, v1 @@ -915,20 +901,23 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1100_W32-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen +; GFX1100_W32-NEXT: s_xor_b32 s1, exec_lo, s0 ; GFX1100_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX1100_W32-NEXT: ; implicit-def: $vgpr8 -; GFX1100_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1100_W32-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1100_W32-NEXT: s_and_b32 s2, s1, -1 +; GFX1100_W32-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1100_W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1100_W32-NEXT: ; %bb.2: -; GFX1100_W32-NEXT: s_mov_b32 exec_lo, s1 ; GFX1100_W32-NEXT: v_and_b32_e32 v0, 0x3ff, v31 ; GFX1100_W32-NEXT: s_mov_b32 s1, exec_lo ; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100_W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1100_W32-NEXT: s_cbranch_execz .LBB2_6 +; GFX1100_W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1100_W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1100_W32-NEXT: s_cbranch_scc0 .LBB2_6 ; GFX1100_W32-NEXT: ; %bb.3: ; %bb1 ; GFX1100_W32-NEXT: v_mov_b32_e32 v0, s4 -; GFX1100_W32-NEXT: s_mov_b32 s2, exec_lo ; GFX1100_W32-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 ; GFX1100_W32-NEXT: v_readfirstlane_b32 s4, v4 ; GFX1100_W32-NEXT: v_readfirstlane_b32 s5, v5 @@ -941,15 +930,16 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1100_W32-NEXT: buffer_load_format_x v9, v0, s[4:7], 0 idxen +; GFX1100_W32-NEXT: s_xor_b32 s2, exec_lo, s0 ; GFX1100_W32-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1100_W32-NEXT: ; implicit-def: $vgpr0 -; GFX1100_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1100_W32-NEXT: s_cbranch_execnz .LBB2_4 -; GFX1100_W32-NEXT: ; %bb.5: -; GFX1100_W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX1100_W32-NEXT: .LBB2_6: ; %bb2 ; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1100_W32-NEXT: s_and_b32 s3, s2, -1 +; GFX1100_W32-NEXT: s_cselect_b32 exec_lo, s2, s0 +; GFX1100_W32-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1100_W32-NEXT: ; %bb.5: ; GFX1100_W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1100_W32-NEXT: .LBB2_6: ; %bb2 ; GFX1100_W32-NEXT: s_waitcnt vmcnt(0) ; GFX1100_W32-NEXT: global_store_b32 v[11:12], v9, off dlc ; GFX1100_W32-NEXT: s_waitcnt_vscnt null, 0x0 @@ -962,7 +952,6 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; GFX1100_W64-NEXT: s_mov_b32 s4, 17 ; GFX1100_W64-NEXT: ;;#ASMEND ; GFX1100_W64-NEXT: v_mov_b32_e32 v8, s4 -; GFX1100_W64-NEXT: s_mov_b64 s[2:3], exec ; GFX1100_W64-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX1100_W64-NEXT: v_readfirstlane_b32 s8, v0 ; GFX1100_W64-NEXT: v_readfirstlane_b32 s9, v1 @@ -975,20 +964,23 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX1100_W64-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen +; GFX1100_W64-NEXT: s_xor_b64 s[2:3], exec, s[0:1] ; GFX1100_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX1100_W64-NEXT: ; implicit-def: $vgpr8 -; GFX1100_W64-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1100_W64-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1100_W64-NEXT: s_and_b64 s[6:7], s[2:3], -1 +; GFX1100_W64-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX1100_W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1100_W64-NEXT: ; %bb.2: -; GFX1100_W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX1100_W64-NEXT: v_and_b32_e32 v0, 0x3ff, v31 ; GFX1100_W64-NEXT: s_mov_b64 s[2:3], exec ; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100_W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1100_W64-NEXT: s_cbranch_execz .LBB2_6 +; GFX1100_W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1100_W64-NEXT: s_cmov_b64 exec, vcc +; GFX1100_W64-NEXT: s_cbranch_scc0 .LBB2_6 ; GFX1100_W64-NEXT: ; %bb.3: ; %bb1 ; GFX1100_W64-NEXT: v_mov_b32_e32 v0, s4 -; GFX1100_W64-NEXT: s_mov_b64 s[8:9], exec ; GFX1100_W64-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 ; GFX1100_W64-NEXT: v_readfirstlane_b32 s4, v4 ; GFX1100_W64-NEXT: v_readfirstlane_b32 s5, v5 @@ -1001,15 +993,16 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX1100_W64-NEXT: buffer_load_format_x v9, v0, s[4:7], 0 idxen +; GFX1100_W64-NEXT: s_xor_b64 s[4:5], exec, s[0:1] ; GFX1100_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1100_W64-NEXT: ; implicit-def: $vgpr0 -; GFX1100_W64-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1100_W64-NEXT: s_cbranch_execnz .LBB2_4 -; GFX1100_W64-NEXT: ; %bb.5: -; GFX1100_W64-NEXT: s_mov_b64 exec, s[8:9] -; GFX1100_W64-NEXT: .LBB2_6: ; %bb2 ; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1100_W64-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1100_W64-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX1100_W64-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1100_W64-NEXT: ; %bb.5: ; GFX1100_W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1100_W64-NEXT: .LBB2_6: ; %bb2 ; GFX1100_W64-NEXT: s_waitcnt vmcnt(0) ; GFX1100_W64-NEXT: global_store_b32 v[11:12], v9, off dlc ; GFX1100_W64-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1036,7 +1029,7 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; W64-O0-NEXT: v_mov_b32_e32 v6, v0 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 @@ -1079,17 +1072,15 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; W64-O0-NEXT: v_writelane_b32 v0, s5, 0 ; W64-O0-NEXT: s_mov_b32 s5, 0 ; W64-O0-NEXT: v_writelane_b32 v0, s5, 1 -; W64-O0-NEXT: v_mov_b32_e32 v1, s4 -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v0, s4, 2 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 3 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: v_mov_b32_e32 v0, s4 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: s_mov_b64 s[4:5], exec ; W64-O0-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload @@ -1111,81 +1102,79 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; W64-O0-NEXT: s_mov_b32 s9, s12 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_writelane_b32 v0, s8, 4 -; W64-O0-NEXT: v_writelane_b32 v0, s9, 5 -; W64-O0-NEXT: v_writelane_b32 v0, s10, 6 -; W64-O0-NEXT: v_writelane_b32 v0, s11, 7 +; W64-O0-NEXT: v_writelane_b32 v0, s8, 2 +; W64-O0-NEXT: v_writelane_b32 v0, s9, 3 +; W64-O0-NEXT: v_writelane_b32 v0, s10, 4 +; W64-O0-NEXT: v_writelane_b32 v0, s11, 5 ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; W64-O0-NEXT: v_writelane_b32 v0, s4, 8 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 9 +; W64-O0-NEXT: v_writelane_b32 v0, s4, 6 +; W64-O0-NEXT: v_writelane_b32 v0, s5, 7 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: ; %bb.2: ; in Loop: Header=BB2_1 Depth=1 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v1, 8 -; W64-O0-NEXT: v_readlane_b32 s5, v1, 9 -; W64-O0-NEXT: v_readlane_b32 s8, v1, 4 -; W64-O0-NEXT: v_readlane_b32 s9, v1, 5 -; W64-O0-NEXT: v_readlane_b32 s10, v1, 6 -; W64-O0-NEXT: v_readlane_b32 s11, v1, 7 -; W64-O0-NEXT: v_readlane_b32 s6, v1, 1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; W64-O0-NEXT: v_readlane_b32 s6, v1, 6 +; W64-O0-NEXT: v_readlane_b32 s7, v1, 7 +; W64-O0-NEXT: v_readlane_b32 s8, v1, 2 +; W64-O0-NEXT: v_readlane_b32 s9, v1, 3 +; W64-O0-NEXT: v_readlane_b32 s10, v1, 4 +; W64-O0-NEXT: v_readlane_b32 s11, v1, 5 +; W64-O0-NEXT: v_readlane_b32 s4, v1, 1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_nop 2 -; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen +; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s4 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] -; W64-O0-NEXT: s_cbranch_execnz .LBB2_1 +; W64-O0-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; W64-O0-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; W64-O0-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; W64-O0-NEXT: s_cbranch_scc1 .LBB2_1 ; W64-O0-NEXT: ; %bb.3: ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s6, v0, 2 -; W64-O0-NEXT: v_readlane_b32 s7, v0, 3 -; W64-O0-NEXT: s_mov_b64 exec, s[6:7] ; W64-O0-NEXT: v_readlane_b32 s4, v0, 1 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b32 s5, 0x3ff ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: v_and_b32_e64 v2, v2, s5 -; W64-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v2, s4 +; W64-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v2, s4 ; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v0, s4, 10 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 11 +; W64-O0-NEXT: s_mov_b64 s[6:7], exec +; W64-O0-NEXT: v_writelane_b32 v0, s6, 8 +; W64-O0-NEXT: v_writelane_b32 v0, s7, 9 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] -; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: s_cbranch_execz .LBB2_8 -; W64-O0-NEXT: ; %bb.4: ; %bb1 +; W64-O0-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; W64-O0-NEXT: s_cmov_b64 exec, s[4:5] +; W64-O0-NEXT: s_cbranch_scc1 .LBB2_4 +; W64-O0-NEXT: s_branch .LBB2_8 +; W64-O0-NEXT: .LBB2_4: ; %bb1 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: v_readlane_b32 s4, v0, 0 ; W64-O0-NEXT: s_mov_b32 s5, 0 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 12 -; W64-O0-NEXT: v_mov_b32_e32 v1, s4 -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 -; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v0, s4, 13 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 14 +; W64-O0-NEXT: v_writelane_b32 v0, s5, 10 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: v_mov_b32_e32 v0, s4 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; W64-O0-NEXT: s_mov_b64 s[4:5], exec ; W64-O0-NEXT: .LBB2_5: ; =>This Inner Loop Header: Depth=1 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -1207,55 +1196,53 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad ; W64-O0-NEXT: s_mov_b32 s9, s12 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_writelane_b32 v0, s8, 15 -; W64-O0-NEXT: v_writelane_b32 v0, s9, 16 -; W64-O0-NEXT: v_writelane_b32 v0, s10, 17 -; W64-O0-NEXT: v_writelane_b32 v0, s11, 18 +; W64-O0-NEXT: v_writelane_b32 v0, s8, 11 +; W64-O0-NEXT: v_writelane_b32 v0, s9, 12 +; W64-O0-NEXT: v_writelane_b32 v0, s10, 13 +; W64-O0-NEXT: v_writelane_b32 v0, s11, 14 ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; W64-O0-NEXT: v_writelane_b32 v0, s4, 19 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 20 +; W64-O0-NEXT: v_writelane_b32 v0, s4, 15 +; W64-O0-NEXT: v_writelane_b32 v0, s5, 16 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: ; %bb.6: ; in Loop: Header=BB2_5 Depth=1 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v1, 19 -; W64-O0-NEXT: v_readlane_b32 s5, v1, 20 -; W64-O0-NEXT: v_readlane_b32 s8, v1, 15 -; W64-O0-NEXT: v_readlane_b32 s9, v1, 16 -; W64-O0-NEXT: v_readlane_b32 s10, v1, 17 -; W64-O0-NEXT: v_readlane_b32 s11, v1, 18 -; W64-O0-NEXT: v_readlane_b32 s6, v1, 12 +; W64-O0-NEXT: v_readlane_b32 s6, v1, 15 +; W64-O0-NEXT: v_readlane_b32 s7, v1, 16 +; W64-O0-NEXT: v_readlane_b32 s8, v1, 11 +; W64-O0-NEXT: v_readlane_b32 s9, v1, 12 +; W64-O0-NEXT: v_readlane_b32 s10, v1, 13 +; W64-O0-NEXT: v_readlane_b32 s11, v1, 14 +; W64-O0-NEXT: v_readlane_b32 s4, v1, 10 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_nop 2 -; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen +; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s4 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] -; W64-O0-NEXT: s_cbranch_execnz .LBB2_5 +; W64-O0-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; W64-O0-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; W64-O0-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; W64-O0-NEXT: s_cbranch_scc1 .LBB2_5 ; W64-O0-NEXT: ; %bb.7: ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v1, 13 -; W64-O0-NEXT: v_readlane_b32 s5, v1, 14 -; W64-O0-NEXT: s_mov_b64 exec, s[4:5] +; W64-O0-NEXT: v_readlane_b32 s4, v1, 8 +; W64-O0-NEXT: v_readlane_b32 s5, v1, 9 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; W64-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; W64-O0-NEXT: .LBB2_8: ; %bb2 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v0, 10 -; W64-O0-NEXT: v_readlane_b32 s5, v0, 11 -; W64-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll index 162c47f879465..866db1aa06dac 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll @@ -12,7 +12,6 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 { ; GFX9_W64-LABEL: mubuf_vgpr: ; GFX9_W64: ; %bb.0: ; GFX9_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9_W64-NEXT: s_mov_b64 s[6:7], exec ; GFX9_W64-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 ; GFX9_W64-NEXT: v_readfirstlane_b32 s8, v0 ; GFX9_W64-NEXT: v_readfirstlane_b32 s9, v1 @@ -24,12 +23,13 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 { ; GFX9_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX9_W64-NEXT: s_nop 0 ; GFX9_W64-NEXT: buffer_load_format_x v5, v4, s[8:11], 0 idxen +; GFX9_W64-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; GFX9_W64-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX9_W64-NEXT: ; implicit-def: $vgpr4 -; GFX9_W64-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX9_W64-NEXT: s_cbranch_execnz .LBB0_1 +; GFX9_W64-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9_W64-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX9_W64-NEXT: ; %bb.2: -; GFX9_W64-NEXT: s_mov_b64 exec, s[6:7] ; GFX9_W64-NEXT: s_waitcnt vmcnt(0) ; GFX9_W64-NEXT: v_mov_b32_e32 v0, v5 ; GFX9_W64-NEXT: s_setpc_b64 s[30:31] @@ -37,7 +37,6 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 { ; GFX1010_W32-LABEL: mubuf_vgpr: ; GFX1010_W32: ; %bb.0: ; GFX1010_W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1010_W32-NEXT: s_mov_b32 s5, exec_lo ; GFX1010_W32-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 ; GFX1010_W32-NEXT: v_readfirstlane_b32 s8, v0 ; GFX1010_W32-NEXT: v_readfirstlane_b32 s9, v1 @@ -48,13 +47,14 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 { ; GFX1010_W32-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX1010_W32-NEXT: s_and_saveexec_b32 s4, s4 ; GFX1010_W32-NEXT: buffer_load_format_x v5, v4, s[8:11], 0 idxen +; GFX1010_W32-NEXT: s_xor_b32 s5, exec_lo, s4 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr4 +; GFX1010_W32-NEXT: s_and_b32 s6, s5, -1 ; GFX1010_W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX1010_W32-NEXT: s_cbranch_execnz .LBB0_1 +; GFX1010_W32-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX1010_W32-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX1010_W32-NEXT: ; %bb.2: -; GFX1010_W32-NEXT: s_mov_b32 exec_lo, s5 ; GFX1010_W32-NEXT: s_waitcnt vmcnt(0) ; GFX1010_W32-NEXT: v_mov_b32_e32 v0, v5 ; GFX1010_W32-NEXT: s_setpc_b64 s[30:31] @@ -62,7 +62,6 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 { ; GFX1010_W64-LABEL: mubuf_vgpr: ; GFX1010_W64: ; %bb.0: ; GFX1010_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1010_W64-NEXT: s_mov_b64 s[6:7], exec ; GFX1010_W64-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 ; GFX1010_W64-NEXT: v_readfirstlane_b32 s8, v0 ; GFX1010_W64-NEXT: v_readfirstlane_b32 s9, v1 @@ -73,13 +72,14 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 { ; GFX1010_W64-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX1010_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX1010_W64-NEXT: buffer_load_format_x v5, v4, s[8:11], 0 idxen +; GFX1010_W64-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; GFX1010_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX1010_W64-NEXT: ; implicit-def: $vgpr4 ; GFX1010_W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010_W64-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX1010_W64-NEXT: s_cbranch_execnz .LBB0_1 +; GFX1010_W64-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX1010_W64-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX1010_W64-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX1010_W64-NEXT: ; %bb.2: -; GFX1010_W64-NEXT: s_mov_b64 exec, s[6:7] ; GFX1010_W64-NEXT: s_waitcnt vmcnt(0) ; GFX1010_W64-NEXT: v_mov_b32_e32 v0, v5 ; GFX1010_W64-NEXT: s_setpc_b64 s[30:31] @@ -87,7 +87,6 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 { ; GFX1100_W32-LABEL: mubuf_vgpr: ; GFX1100_W32: ; %bb.0: ; GFX1100_W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100_W32-NEXT: s_mov_b32 s1, exec_lo ; GFX1100_W32-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 ; GFX1100_W32-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1100_W32-NEXT: v_readfirstlane_b32 s5, v1 @@ -100,12 +99,14 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 { ; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1100_W32-NEXT: buffer_load_format_x v5, v4, s[4:7], 0 idxen +; GFX1100_W32-NEXT: s_xor_b32 s1, exec_lo, s0 ; GFX1100_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX1100_W32-NEXT: ; implicit-def: $vgpr4 -; GFX1100_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1100_W32-NEXT: s_cbranch_execnz .LBB0_1 +; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1100_W32-NEXT: s_and_b32 s2, s1, -1 +; GFX1100_W32-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1100_W32-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX1100_W32-NEXT: ; %bb.2: -; GFX1100_W32-NEXT: s_mov_b32 exec_lo, s1 ; GFX1100_W32-NEXT: s_waitcnt vmcnt(0) ; GFX1100_W32-NEXT: v_mov_b32_e32 v0, v5 ; GFX1100_W32-NEXT: s_setpc_b64 s[30:31] @@ -113,7 +114,6 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 { ; GFX1100_W64-LABEL: mubuf_vgpr: ; GFX1100_W64: ; %bb.0: ; GFX1100_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100_W64-NEXT: s_mov_b64 s[2:3], exec ; GFX1100_W64-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 ; GFX1100_W64-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1100_W64-NEXT: v_readfirstlane_b32 s5, v1 @@ -126,12 +126,14 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 { ; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX1100_W64-NEXT: buffer_load_format_x v5, v4, s[4:7], 0 idxen +; GFX1100_W64-NEXT: s_xor_b64 s[2:3], exec, s[0:1] ; GFX1100_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX1100_W64-NEXT: ; implicit-def: $vgpr4 -; GFX1100_W64-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1100_W64-NEXT: s_cbranch_execnz .LBB0_1 +; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1100_W64-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX1100_W64-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX1100_W64-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX1100_W64-NEXT: ; %bb.2: -; GFX1100_W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX1100_W64-NEXT: s_waitcnt vmcnt(0) ; GFX1100_W64-NEXT: v_mov_b32_e32 v0, v5 ; GFX1100_W64-NEXT: s_setpc_b64 s[30:31] @@ -183,12 +185,10 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 { ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; W64-O0-NEXT: s_mov_b32 s4, 0 ; W64-O0-NEXT: v_writelane_b32 v0, s4, 0 -; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v0, s4, 1 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 2 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: s_mov_b64 s[4:5], exec ; W64-O0-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload @@ -213,13 +213,13 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 { ; W64-O0-NEXT: s_mov_b32 s9, s12 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_writelane_b32 v0, s8, 3 -; W64-O0-NEXT: v_writelane_b32 v0, s9, 4 -; W64-O0-NEXT: v_writelane_b32 v0, s10, 5 -; W64-O0-NEXT: v_writelane_b32 v0, s11, 6 +; W64-O0-NEXT: v_writelane_b32 v0, s8, 1 +; W64-O0-NEXT: v_writelane_b32 v0, s9, 2 +; W64-O0-NEXT: v_writelane_b32 v0, s10, 3 +; W64-O0-NEXT: v_writelane_b32 v0, s11, 4 ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; W64-O0-NEXT: v_writelane_b32 v0, s4, 7 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 8 +; W64-O0-NEXT: v_writelane_b32 v0, s4, 5 +; W64-O0-NEXT: v_writelane_b32 v0, s5, 6 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] @@ -228,29 +228,27 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 { ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v1, 7 -; W64-O0-NEXT: v_readlane_b32 s5, v1, 8 -; W64-O0-NEXT: v_readlane_b32 s8, v1, 3 -; W64-O0-NEXT: v_readlane_b32 s9, v1, 4 -; W64-O0-NEXT: v_readlane_b32 s10, v1, 5 -; W64-O0-NEXT: v_readlane_b32 s11, v1, 6 -; W64-O0-NEXT: v_readlane_b32 s6, v1, 0 +; W64-O0-NEXT: v_readlane_b32 s6, v1, 5 +; W64-O0-NEXT: v_readlane_b32 s7, v1, 6 +; W64-O0-NEXT: v_readlane_b32 s8, v1, 1 +; W64-O0-NEXT: v_readlane_b32 s9, v1, 2 +; W64-O0-NEXT: v_readlane_b32 s10, v1, 3 +; W64-O0-NEXT: v_readlane_b32 s11, v1, 4 +; W64-O0-NEXT: v_readlane_b32 s4, v1, 0 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_nop 2 -; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen +; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s4 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] -; W64-O0-NEXT: s_cbranch_execnz .LBB0_1 +; W64-O0-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; W64-O0-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; W64-O0-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; W64-O0-NEXT: s_cbranch_scc1 .LBB0_1 ; W64-O0-NEXT: ; %bb.3: ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v1, 1 -; W64-O0-NEXT: v_readlane_b32 s5, v1, 2 -; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; W64-O0-NEXT: ; kill: killed $vgpr1 ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 @@ -273,7 +271,6 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; GFX9_W64-LABEL: mubuf_vgpr_adjacent_in_block: ; GFX9_W64: ; %bb.0: ; %entry ; GFX9_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9_W64-NEXT: s_mov_b64 s[6:7], exec ; GFX9_W64-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 ; GFX9_W64-NEXT: v_readfirstlane_b32 s8, v0 ; GFX9_W64-NEXT: v_readfirstlane_b32 s9, v1 @@ -285,13 +282,12 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; GFX9_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX9_W64-NEXT: s_nop 0 ; GFX9_W64-NEXT: buffer_load_format_x v13, v8, s[8:11], 0 idxen +; GFX9_W64-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; GFX9_W64-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX9_W64-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX9_W64-NEXT: s_cbranch_execnz .LBB1_1 -; GFX9_W64-NEXT: ; %bb.2: -; GFX9_W64-NEXT: s_mov_b64 exec, s[6:7] -; GFX9_W64-NEXT: s_mov_b64 s[6:7], exec -; GFX9_W64-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1 +; GFX9_W64-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9_W64-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX9_W64-NEXT: .LBB1_2: ; =>This Inner Loop Header: Depth=1 ; GFX9_W64-NEXT: v_readfirstlane_b32 s8, v4 ; GFX9_W64-NEXT: v_readfirstlane_b32 s9, v5 ; GFX9_W64-NEXT: v_readfirstlane_b32 s10, v6 @@ -302,12 +298,13 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; GFX9_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX9_W64-NEXT: s_nop 0 ; GFX9_W64-NEXT: buffer_load_format_x v0, v8, s[8:11], 0 idxen +; GFX9_W64-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; GFX9_W64-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX9_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX9_W64-NEXT: ; implicit-def: $vgpr8 -; GFX9_W64-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX9_W64-NEXT: s_cbranch_execnz .LBB1_3 -; GFX9_W64-NEXT: ; %bb.4: -; GFX9_W64-NEXT: s_mov_b64 exec, s[6:7] +; GFX9_W64-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9_W64-NEXT: s_cbranch_scc1 .LBB1_2 +; GFX9_W64-NEXT: ; %bb.3: ; GFX9_W64-NEXT: s_waitcnt vmcnt(1) ; GFX9_W64-NEXT: global_store_dword v[9:10], v13, off ; GFX9_W64-NEXT: s_waitcnt vmcnt(0) @@ -318,7 +315,6 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; GFX1010_W32-LABEL: mubuf_vgpr_adjacent_in_block: ; GFX1010_W32: ; %bb.0: ; %entry ; GFX1010_W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1010_W32-NEXT: s_mov_b32 s5, exec_lo ; GFX1010_W32-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 ; GFX1010_W32-NEXT: v_readfirstlane_b32 s8, v0 ; GFX1010_W32-NEXT: v_readfirstlane_b32 s9, v1 @@ -329,14 +325,13 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; GFX1010_W32-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX1010_W32-NEXT: s_and_saveexec_b32 s4, s4 ; GFX1010_W32-NEXT: buffer_load_format_x v13, v8, s[8:11], 0 idxen +; GFX1010_W32-NEXT: s_xor_b32 s5, exec_lo, s4 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX1010_W32-NEXT: s_and_b32 s6, s5, -1 ; GFX1010_W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX1010_W32-NEXT: s_cbranch_execnz .LBB1_1 -; GFX1010_W32-NEXT: ; %bb.2: -; GFX1010_W32-NEXT: s_mov_b32 exec_lo, s5 -; GFX1010_W32-NEXT: s_mov_b32 s5, exec_lo -; GFX1010_W32-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1 +; GFX1010_W32-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX1010_W32-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1010_W32-NEXT: .LBB1_2: ; =>This Inner Loop Header: Depth=1 ; GFX1010_W32-NEXT: v_readfirstlane_b32 s8, v4 ; GFX1010_W32-NEXT: v_readfirstlane_b32 s9, v5 ; GFX1010_W32-NEXT: v_readfirstlane_b32 s10, v6 @@ -346,13 +341,14 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; GFX1010_W32-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX1010_W32-NEXT: s_and_saveexec_b32 s4, s4 ; GFX1010_W32-NEXT: buffer_load_format_x v0, v8, s[8:11], 0 idxen +; GFX1010_W32-NEXT: s_xor_b32 s5, exec_lo, s4 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr8 +; GFX1010_W32-NEXT: s_and_b32 s6, s5, -1 ; GFX1010_W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX1010_W32-NEXT: s_cbranch_execnz .LBB1_3 -; GFX1010_W32-NEXT: ; %bb.4: -; GFX1010_W32-NEXT: s_mov_b32 exec_lo, s5 +; GFX1010_W32-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GFX1010_W32-NEXT: s_cbranch_scc1 .LBB1_2 +; GFX1010_W32-NEXT: ; %bb.3: ; GFX1010_W32-NEXT: s_waitcnt vmcnt(1) ; GFX1010_W32-NEXT: global_store_dword v[9:10], v13, off ; GFX1010_W32-NEXT: s_waitcnt_vscnt null, 0x0 @@ -364,7 +360,6 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; GFX1010_W64-LABEL: mubuf_vgpr_adjacent_in_block: ; GFX1010_W64: ; %bb.0: ; %entry ; GFX1010_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1010_W64-NEXT: s_mov_b64 s[6:7], exec ; GFX1010_W64-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 ; GFX1010_W64-NEXT: v_readfirstlane_b32 s8, v0 ; GFX1010_W64-NEXT: v_readfirstlane_b32 s9, v1 @@ -375,14 +370,13 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; GFX1010_W64-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX1010_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX1010_W64-NEXT: buffer_load_format_x v13, v8, s[8:11], 0 idxen +; GFX1010_W64-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; GFX1010_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX1010_W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010_W64-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX1010_W64-NEXT: s_cbranch_execnz .LBB1_1 -; GFX1010_W64-NEXT: ; %bb.2: -; GFX1010_W64-NEXT: s_mov_b64 exec, s[6:7] -; GFX1010_W64-NEXT: s_mov_b64 s[6:7], exec -; GFX1010_W64-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1 +; GFX1010_W64-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX1010_W64-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX1010_W64-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1010_W64-NEXT: .LBB1_2: ; =>This Inner Loop Header: Depth=1 ; GFX1010_W64-NEXT: v_readfirstlane_b32 s8, v4 ; GFX1010_W64-NEXT: v_readfirstlane_b32 s9, v5 ; GFX1010_W64-NEXT: v_readfirstlane_b32 s10, v6 @@ -392,13 +386,14 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; GFX1010_W64-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX1010_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX1010_W64-NEXT: buffer_load_format_x v0, v8, s[8:11], 0 idxen +; GFX1010_W64-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; GFX1010_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1010_W64-NEXT: ; implicit-def: $vgpr8 ; GFX1010_W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010_W64-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX1010_W64-NEXT: s_cbranch_execnz .LBB1_3 -; GFX1010_W64-NEXT: ; %bb.4: -; GFX1010_W64-NEXT: s_mov_b64 exec, s[6:7] +; GFX1010_W64-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX1010_W64-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX1010_W64-NEXT: s_cbranch_scc1 .LBB1_2 +; GFX1010_W64-NEXT: ; %bb.3: ; GFX1010_W64-NEXT: s_waitcnt vmcnt(1) ; GFX1010_W64-NEXT: global_store_dword v[9:10], v13, off ; GFX1010_W64-NEXT: s_waitcnt_vscnt null, 0x0 @@ -410,7 +405,6 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; GFX1100_W32-LABEL: mubuf_vgpr_adjacent_in_block: ; GFX1100_W32: ; %bb.0: ; %entry ; GFX1100_W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100_W32-NEXT: s_mov_b32 s1, exec_lo ; GFX1100_W32-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 ; GFX1100_W32-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1100_W32-NEXT: v_readfirstlane_b32 s5, v1 @@ -423,14 +417,13 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1100_W32-NEXT: buffer_load_format_x v13, v8, s[4:7], 0 idxen +; GFX1100_W32-NEXT: s_xor_b32 s1, exec_lo, s0 ; GFX1100_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX1100_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1100_W32-NEXT: s_cbranch_execnz .LBB1_1 -; GFX1100_W32-NEXT: ; %bb.2: -; GFX1100_W32-NEXT: s_mov_b32 exec_lo, s1 ; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1100_W32-NEXT: s_mov_b32 s1, exec_lo -; GFX1100_W32-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1 +; GFX1100_W32-NEXT: s_and_b32 s2, s1, -1 +; GFX1100_W32-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1100_W32-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1100_W32-NEXT: .LBB1_2: ; =>This Inner Loop Header: Depth=1 ; GFX1100_W32-NEXT: v_readfirstlane_b32 s4, v4 ; GFX1100_W32-NEXT: v_readfirstlane_b32 s5, v5 ; GFX1100_W32-NEXT: v_readfirstlane_b32 s6, v6 @@ -442,12 +435,14 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1100_W32-NEXT: buffer_load_format_x v0, v8, s[4:7], 0 idxen +; GFX1100_W32-NEXT: s_xor_b32 s1, exec_lo, s0 ; GFX1100_W32-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1100_W32-NEXT: ; implicit-def: $vgpr8 -; GFX1100_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1100_W32-NEXT: s_cbranch_execnz .LBB1_3 -; GFX1100_W32-NEXT: ; %bb.4: -; GFX1100_W32-NEXT: s_mov_b32 exec_lo, s1 +; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1100_W32-NEXT: s_and_b32 s2, s1, -1 +; GFX1100_W32-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1100_W32-NEXT: s_cbranch_scc1 .LBB1_2 +; GFX1100_W32-NEXT: ; %bb.3: ; GFX1100_W32-NEXT: s_waitcnt vmcnt(1) ; GFX1100_W32-NEXT: global_store_b32 v[9:10], v13, off dlc ; GFX1100_W32-NEXT: s_waitcnt_vscnt null, 0x0 @@ -459,7 +454,6 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; GFX1100_W64-LABEL: mubuf_vgpr_adjacent_in_block: ; GFX1100_W64: ; %bb.0: ; %entry ; GFX1100_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100_W64-NEXT: s_mov_b64 s[2:3], exec ; GFX1100_W64-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 ; GFX1100_W64-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1100_W64-NEXT: v_readfirstlane_b32 s5, v1 @@ -472,14 +466,13 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX1100_W64-NEXT: buffer_load_format_x v13, v8, s[4:7], 0 idxen +; GFX1100_W64-NEXT: s_xor_b64 s[2:3], exec, s[0:1] ; GFX1100_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX1100_W64-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1100_W64-NEXT: s_cbranch_execnz .LBB1_1 -; GFX1100_W64-NEXT: ; %bb.2: -; GFX1100_W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1100_W64-NEXT: s_mov_b64 s[2:3], exec -; GFX1100_W64-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1 +; GFX1100_W64-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX1100_W64-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX1100_W64-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1100_W64-NEXT: .LBB1_2: ; =>This Inner Loop Header: Depth=1 ; GFX1100_W64-NEXT: v_readfirstlane_b32 s4, v4 ; GFX1100_W64-NEXT: v_readfirstlane_b32 s5, v5 ; GFX1100_W64-NEXT: v_readfirstlane_b32 s6, v6 @@ -491,12 +484,14 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX1100_W64-NEXT: buffer_load_format_x v0, v8, s[4:7], 0 idxen +; GFX1100_W64-NEXT: s_xor_b64 s[2:3], exec, s[0:1] ; GFX1100_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1100_W64-NEXT: ; implicit-def: $vgpr8 -; GFX1100_W64-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1100_W64-NEXT: s_cbranch_execnz .LBB1_3 -; GFX1100_W64-NEXT: ; %bb.4: -; GFX1100_W64-NEXT: s_mov_b64 exec, s[2:3] +; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1100_W64-NEXT: s_and_b64 s[4:5], s[2:3], -1 +; GFX1100_W64-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX1100_W64-NEXT: s_cbranch_scc1 .LBB1_2 +; GFX1100_W64-NEXT: ; %bb.3: ; GFX1100_W64-NEXT: s_waitcnt vmcnt(1) ; GFX1100_W64-NEXT: global_store_b32 v[9:10], v13, off dlc ; GFX1100_W64-NEXT: s_waitcnt_vscnt null, 0x0 @@ -600,12 +595,10 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; W64-O0-NEXT: s_mov_b32 s4, 0 ; W64-O0-NEXT: v_writelane_b32 v0, s4, 0 -; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v0, s4, 1 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 2 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: s_mov_b64 s[4:5], exec ; W64-O0-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload @@ -630,13 +623,13 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; W64-O0-NEXT: s_mov_b32 s9, s12 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_writelane_b32 v0, s8, 3 -; W64-O0-NEXT: v_writelane_b32 v0, s9, 4 -; W64-O0-NEXT: v_writelane_b32 v0, s10, 5 -; W64-O0-NEXT: v_writelane_b32 v0, s11, 6 +; W64-O0-NEXT: v_writelane_b32 v0, s8, 1 +; W64-O0-NEXT: v_writelane_b32 v0, s9, 2 +; W64-O0-NEXT: v_writelane_b32 v0, s10, 3 +; W64-O0-NEXT: v_writelane_b32 v0, s11, 4 ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; W64-O0-NEXT: v_writelane_b32 v0, s4, 7 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 8 +; W64-O0-NEXT: v_writelane_b32 v0, s4, 5 +; W64-O0-NEXT: v_writelane_b32 v0, s5, 6 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] @@ -645,35 +638,25 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v1, 7 -; W64-O0-NEXT: v_readlane_b32 s5, v1, 8 -; W64-O0-NEXT: v_readlane_b32 s8, v1, 3 -; W64-O0-NEXT: v_readlane_b32 s9, v1, 4 -; W64-O0-NEXT: v_readlane_b32 s10, v1, 5 -; W64-O0-NEXT: v_readlane_b32 s11, v1, 6 -; W64-O0-NEXT: v_readlane_b32 s6, v1, 0 +; W64-O0-NEXT: v_readlane_b32 s6, v1, 5 +; W64-O0-NEXT: v_readlane_b32 s7, v1, 6 +; W64-O0-NEXT: v_readlane_b32 s8, v1, 1 +; W64-O0-NEXT: v_readlane_b32 s9, v1, 2 +; W64-O0-NEXT: v_readlane_b32 s10, v1, 3 +; W64-O0-NEXT: v_readlane_b32 s11, v1, 4 +; W64-O0-NEXT: v_readlane_b32 s4, v1, 0 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_nop 2 -; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen +; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s4 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] -; W64-O0-NEXT: s_cbranch_execnz .LBB1_1 +; W64-O0-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; W64-O0-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; W64-O0-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; W64-O0-NEXT: s_cbranch_scc1 .LBB1_1 ; W64-O0-NEXT: ; %bb.3: -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v0, 1 -; W64-O0-NEXT: v_readlane_b32 s5, v0, 2 -; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v0, s4, 9 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 10 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: .LBB1_4: ; =>This Inner Loop Header: Depth=1 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload @@ -698,13 +681,13 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; W64-O0-NEXT: s_mov_b32 s9, s12 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_writelane_b32 v0, s8, 11 -; W64-O0-NEXT: v_writelane_b32 v0, s9, 12 -; W64-O0-NEXT: v_writelane_b32 v0, s10, 13 -; W64-O0-NEXT: v_writelane_b32 v0, s11, 14 +; W64-O0-NEXT: v_writelane_b32 v0, s8, 7 +; W64-O0-NEXT: v_writelane_b32 v0, s9, 8 +; W64-O0-NEXT: v_writelane_b32 v0, s10, 9 +; W64-O0-NEXT: v_writelane_b32 v0, s11, 10 ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; W64-O0-NEXT: v_writelane_b32 v0, s4, 15 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 16 +; W64-O0-NEXT: v_writelane_b32 v0, s4, 11 +; W64-O0-NEXT: v_writelane_b32 v0, s5, 12 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] @@ -713,29 +696,27 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v1, 15 -; W64-O0-NEXT: v_readlane_b32 s5, v1, 16 -; W64-O0-NEXT: v_readlane_b32 s8, v1, 11 -; W64-O0-NEXT: v_readlane_b32 s9, v1, 12 -; W64-O0-NEXT: v_readlane_b32 s10, v1, 13 -; W64-O0-NEXT: v_readlane_b32 s11, v1, 14 -; W64-O0-NEXT: v_readlane_b32 s6, v1, 0 +; W64-O0-NEXT: v_readlane_b32 s6, v1, 11 +; W64-O0-NEXT: v_readlane_b32 s7, v1, 12 +; W64-O0-NEXT: v_readlane_b32 s8, v1, 7 +; W64-O0-NEXT: v_readlane_b32 s9, v1, 8 +; W64-O0-NEXT: v_readlane_b32 s10, v1, 9 +; W64-O0-NEXT: v_readlane_b32 s11, v1, 10 +; W64-O0-NEXT: v_readlane_b32 s4, v1, 0 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_nop 2 -; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen +; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s4 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] -; W64-O0-NEXT: s_cbranch_execnz .LBB1_4 +; W64-O0-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; W64-O0-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; W64-O0-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; W64-O0-NEXT: s_cbranch_scc1 .LBB1_4 ; W64-O0-NEXT: ; %bb.6: ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v0, 9 -; W64-O0-NEXT: v_readlane_b32 s5, v0, 10 -; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload @@ -772,8 +753,7 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX9_W64-NEXT: ;;#ASMSTART ; GFX9_W64-NEXT: s_mov_b32 s4, 17 ; GFX9_W64-NEXT: ;;#ASMEND -; GFX9_W64-NEXT: v_mov_b32_e32 v8, s4 -; GFX9_W64-NEXT: s_mov_b64 s[12:13], exec +; GFX9_W64-NEXT: v_mov_b32_e32 v9, s4 ; GFX9_W64-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX9_W64-NEXT: v_readfirstlane_b32 s8, v0 ; GFX9_W64-NEXT: v_readfirstlane_b32 s9, v1 @@ -784,20 +764,22 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX9_W64-NEXT: s_and_b64 s[6:7], vcc, s[6:7] ; GFX9_W64-NEXT: s_and_saveexec_b64 s[6:7], s[6:7] ; GFX9_W64-NEXT: s_nop 0 -; GFX9_W64-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen +; GFX9_W64-NEXT: buffer_load_format_x v8, v9, s[8:11], 0 idxen +; GFX9_W64-NEXT: s_xor_b64 s[8:9], exec, s[6:7] +; GFX9_W64-NEXT: s_and_b64 s[10:11], s[8:9], -1 ; GFX9_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX9_W64-NEXT: ; implicit-def: $vgpr8 -; GFX9_W64-NEXT: s_xor_b64 exec, exec, s[6:7] -; GFX9_W64-NEXT: s_cbranch_execnz .LBB2_1 +; GFX9_W64-NEXT: ; implicit-def: $vgpr9 +; GFX9_W64-NEXT: s_cselect_b64 exec, s[8:9], s[6:7] +; GFX9_W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9_W64-NEXT: ; %bb.2: -; GFX9_W64-NEXT: s_mov_b64 exec, s[12:13] ; GFX9_W64-NEXT: v_and_b32_e32 v0, 0x3ff, v31 ; GFX9_W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9_W64-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX9_W64-NEXT: s_cbranch_execz .LBB2_6 +; GFX9_W64-NEXT: s_mov_b64 s[6:7], exec +; GFX9_W64-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX9_W64-NEXT: s_cmov_b64 exec, vcc +; GFX9_W64-NEXT: s_cbranch_scc0 .LBB2_6 ; GFX9_W64-NEXT: ; %bb.3: ; %bb1 ; GFX9_W64-NEXT: v_mov_b32_e32 v0, s4 -; GFX9_W64-NEXT: s_mov_b64 s[12:13], exec ; GFX9_W64-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 ; GFX9_W64-NEXT: v_readfirstlane_b32 s8, v4 ; GFX9_W64-NEXT: v_readfirstlane_b32 s9, v5 @@ -808,17 +790,18 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX9_W64-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX9_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX9_W64-NEXT: s_nop 0 -; GFX9_W64-NEXT: buffer_load_format_x v9, v0, s[8:11], 0 idxen +; GFX9_W64-NEXT: buffer_load_format_x v8, v0, s[8:11], 0 idxen +; GFX9_W64-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; GFX9_W64-NEXT: s_and_b64 s[10:11], s[8:9], -1 ; GFX9_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX9_W64-NEXT: ; implicit-def: $vgpr0 -; GFX9_W64-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX9_W64-NEXT: s_cbranch_execnz .LBB2_4 +; GFX9_W64-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX9_W64-NEXT: s_cbranch_scc1 .LBB2_4 ; GFX9_W64-NEXT: ; %bb.5: -; GFX9_W64-NEXT: s_mov_b64 exec, s[12:13] -; GFX9_W64-NEXT: .LBB2_6: ; %bb2 ; GFX9_W64-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9_W64-NEXT: .LBB2_6: ; %bb2 ; GFX9_W64-NEXT: s_waitcnt vmcnt(0) -; GFX9_W64-NEXT: global_store_dword v[11:12], v9, off +; GFX9_W64-NEXT: global_store_dword v[11:12], v8, off ; GFX9_W64-NEXT: s_waitcnt vmcnt(0) ; GFX9_W64-NEXT: s_setpc_b64 s[30:31] ; @@ -829,7 +812,6 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX1010_W32-NEXT: s_mov_b32 s4, 17 ; GFX1010_W32-NEXT: ;;#ASMEND ; GFX1010_W32-NEXT: v_mov_b32_e32 v8, s4 -; GFX1010_W32-NEXT: s_mov_b32 s6, exec_lo ; GFX1010_W32-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX1010_W32-NEXT: v_readfirstlane_b32 s8, v0 ; GFX1010_W32-NEXT: v_readfirstlane_b32 s9, v1 @@ -840,20 +822,22 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX1010_W32-NEXT: s_and_b32 s5, vcc_lo, s5 ; GFX1010_W32-NEXT: s_and_saveexec_b32 s5, s5 ; GFX1010_W32-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen +; GFX1010_W32-NEXT: s_xor_b32 s6, exec_lo, s5 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr8 +; GFX1010_W32-NEXT: s_and_b32 s7, s6, -1 ; GFX1010_W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s5 -; GFX1010_W32-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1010_W32-NEXT: s_cselect_b32 exec_lo, s6, s5 +; GFX1010_W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1010_W32-NEXT: ; %bb.2: -; GFX1010_W32-NEXT: s_mov_b32 exec_lo, s6 ; GFX1010_W32-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; GFX1010_W32-NEXT: s_mov_b32 s5, exec_lo ; GFX1010_W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1010_W32-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1010_W32-NEXT: s_cbranch_execz .LBB2_6 +; GFX1010_W32-NEXT: s_and_b32 s6, vcc_lo, -1 +; GFX1010_W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1010_W32-NEXT: s_cbranch_scc0 .LBB2_6 ; GFX1010_W32-NEXT: ; %bb.3: ; %bb1 ; GFX1010_W32-NEXT: v_mov_b32_e32 v0, s4 -; GFX1010_W32-NEXT: s_mov_b32 s6, exec_lo ; GFX1010_W32-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 ; GFX1010_W32-NEXT: v_readfirstlane_b32 s8, v4 ; GFX1010_W32-NEXT: v_readfirstlane_b32 s9, v5 @@ -864,15 +848,16 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX1010_W32-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX1010_W32-NEXT: s_and_saveexec_b32 s4, s4 ; GFX1010_W32-NEXT: buffer_load_format_x v9, v0, s[8:11], 0 idxen +; GFX1010_W32-NEXT: s_xor_b32 s6, exec_lo, s4 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1010_W32-NEXT: ; implicit-def: $vgpr0 +; GFX1010_W32-NEXT: s_and_b32 s7, s6, -1 ; GFX1010_W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX1010_W32-NEXT: s_cbranch_execnz .LBB2_4 +; GFX1010_W32-NEXT: s_cselect_b32 exec_lo, s6, s4 +; GFX1010_W32-NEXT: s_cbranch_scc1 .LBB2_4 ; GFX1010_W32-NEXT: ; %bb.5: -; GFX1010_W32-NEXT: s_mov_b32 exec_lo, s6 -; GFX1010_W32-NEXT: .LBB2_6: ; %bb2 ; GFX1010_W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1010_W32-NEXT: .LBB2_6: ; %bb2 ; GFX1010_W32-NEXT: s_waitcnt vmcnt(0) ; GFX1010_W32-NEXT: global_store_dword v[11:12], v9, off ; GFX1010_W32-NEXT: s_waitcnt_vscnt null, 0x0 @@ -885,7 +870,6 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX1010_W64-NEXT: s_mov_b32 s4, 17 ; GFX1010_W64-NEXT: ;;#ASMEND ; GFX1010_W64-NEXT: v_mov_b32_e32 v8, s4 -; GFX1010_W64-NEXT: s_mov_b64 s[12:13], exec ; GFX1010_W64-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX1010_W64-NEXT: v_readfirstlane_b32 s8, v0 ; GFX1010_W64-NEXT: v_readfirstlane_b32 s9, v1 @@ -896,20 +880,22 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX1010_W64-NEXT: s_and_b64 s[6:7], vcc, s[6:7] ; GFX1010_W64-NEXT: s_and_saveexec_b64 s[6:7], s[6:7] ; GFX1010_W64-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen +; GFX1010_W64-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1010_W64-NEXT: s_xor_b64 s[8:9], exec, s[6:7] ; GFX1010_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX1010_W64-NEXT: ; implicit-def: $vgpr8 -; GFX1010_W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010_W64-NEXT: s_xor_b64 exec, exec, s[6:7] -; GFX1010_W64-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1010_W64-NEXT: s_and_b64 s[10:11], s[8:9], -1 +; GFX1010_W64-NEXT: s_cselect_b64 exec, s[8:9], s[6:7] +; GFX1010_W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1010_W64-NEXT: ; %bb.2: -; GFX1010_W64-NEXT: s_mov_b64 exec, s[12:13] ; GFX1010_W64-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; GFX1010_W64-NEXT: s_mov_b64 s[6:7], exec ; GFX1010_W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1010_W64-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX1010_W64-NEXT: s_cbranch_execz .LBB2_6 +; GFX1010_W64-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX1010_W64-NEXT: s_cmov_b64 exec, vcc +; GFX1010_W64-NEXT: s_cbranch_scc0 .LBB2_6 ; GFX1010_W64-NEXT: ; %bb.3: ; %bb1 ; GFX1010_W64-NEXT: v_mov_b32_e32 v0, s4 -; GFX1010_W64-NEXT: s_mov_b64 s[12:13], exec ; GFX1010_W64-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 ; GFX1010_W64-NEXT: v_readfirstlane_b32 s8, v4 ; GFX1010_W64-NEXT: v_readfirstlane_b32 s9, v5 @@ -920,15 +906,16 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX1010_W64-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX1010_W64-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX1010_W64-NEXT: buffer_load_format_x v9, v0, s[8:11], 0 idxen +; GFX1010_W64-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1010_W64-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GFX1010_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1010_W64-NEXT: ; implicit-def: $vgpr0 -; GFX1010_W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010_W64-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX1010_W64-NEXT: s_cbranch_execnz .LBB2_4 +; GFX1010_W64-NEXT: s_and_b64 s[10:11], s[8:9], -1 +; GFX1010_W64-NEXT: s_cselect_b64 exec, s[8:9], s[4:5] +; GFX1010_W64-NEXT: s_cbranch_scc1 .LBB2_4 ; GFX1010_W64-NEXT: ; %bb.5: -; GFX1010_W64-NEXT: s_mov_b64 exec, s[12:13] -; GFX1010_W64-NEXT: .LBB2_6: ; %bb2 ; GFX1010_W64-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1010_W64-NEXT: .LBB2_6: ; %bb2 ; GFX1010_W64-NEXT: s_waitcnt vmcnt(0) ; GFX1010_W64-NEXT: global_store_dword v[11:12], v9, off ; GFX1010_W64-NEXT: s_waitcnt_vscnt null, 0x0 @@ -941,7 +928,6 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX1100_W32-NEXT: s_mov_b32 s4, 17 ; GFX1100_W32-NEXT: ;;#ASMEND ; GFX1100_W32-NEXT: v_mov_b32_e32 v8, s4 -; GFX1100_W32-NEXT: s_mov_b32 s1, exec_lo ; GFX1100_W32-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX1100_W32-NEXT: v_readfirstlane_b32 s8, v0 ; GFX1100_W32-NEXT: v_readfirstlane_b32 s9, v1 @@ -954,20 +940,23 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1100_W32-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen +; GFX1100_W32-NEXT: s_xor_b32 s1, exec_lo, s0 ; GFX1100_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX1100_W32-NEXT: ; implicit-def: $vgpr8 -; GFX1100_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1100_W32-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1100_W32-NEXT: s_and_b32 s2, s1, -1 +; GFX1100_W32-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX1100_W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1100_W32-NEXT: ; %bb.2: -; GFX1100_W32-NEXT: s_mov_b32 exec_lo, s1 ; GFX1100_W32-NEXT: v_and_b32_e32 v0, 0x3ff, v31 ; GFX1100_W32-NEXT: s_mov_b32 s1, exec_lo ; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100_W32-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1100_W32-NEXT: s_cbranch_execz .LBB2_6 +; GFX1100_W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX1100_W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1100_W32-NEXT: s_cbranch_scc0 .LBB2_6 ; GFX1100_W32-NEXT: ; %bb.3: ; %bb1 ; GFX1100_W32-NEXT: v_mov_b32_e32 v0, s4 -; GFX1100_W32-NEXT: s_mov_b32 s2, exec_lo ; GFX1100_W32-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 ; GFX1100_W32-NEXT: v_readfirstlane_b32 s4, v4 ; GFX1100_W32-NEXT: v_readfirstlane_b32 s5, v5 @@ -980,15 +969,16 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1100_W32-NEXT: buffer_load_format_x v9, v0, s[4:7], 0 idxen +; GFX1100_W32-NEXT: s_xor_b32 s2, exec_lo, s0 ; GFX1100_W32-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1100_W32-NEXT: ; implicit-def: $vgpr0 -; GFX1100_W32-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1100_W32-NEXT: s_cbranch_execnz .LBB2_4 -; GFX1100_W32-NEXT: ; %bb.5: -; GFX1100_W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX1100_W32-NEXT: .LBB2_6: ; %bb2 ; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1100_W32-NEXT: s_and_b32 s3, s2, -1 +; GFX1100_W32-NEXT: s_cselect_b32 exec_lo, s2, s0 +; GFX1100_W32-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1100_W32-NEXT: ; %bb.5: ; GFX1100_W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1100_W32-NEXT: .LBB2_6: ; %bb2 ; GFX1100_W32-NEXT: s_waitcnt vmcnt(0) ; GFX1100_W32-NEXT: global_store_b32 v[11:12], v9, off dlc ; GFX1100_W32-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1001,7 +991,6 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX1100_W64-NEXT: s_mov_b32 s4, 17 ; GFX1100_W64-NEXT: ;;#ASMEND ; GFX1100_W64-NEXT: v_mov_b32_e32 v8, s4 -; GFX1100_W64-NEXT: s_mov_b64 s[2:3], exec ; GFX1100_W64-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX1100_W64-NEXT: v_readfirstlane_b32 s8, v0 ; GFX1100_W64-NEXT: v_readfirstlane_b32 s9, v1 @@ -1014,20 +1003,23 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX1100_W64-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen +; GFX1100_W64-NEXT: s_xor_b64 s[2:3], exec, s[0:1] ; GFX1100_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX1100_W64-NEXT: ; implicit-def: $vgpr8 -; GFX1100_W64-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1100_W64-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1100_W64-NEXT: s_and_b64 s[6:7], s[2:3], -1 +; GFX1100_W64-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX1100_W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1100_W64-NEXT: ; %bb.2: -; GFX1100_W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX1100_W64-NEXT: v_and_b32_e32 v0, 0x3ff, v31 ; GFX1100_W64-NEXT: s_mov_b64 s[2:3], exec ; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100_W64-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX1100_W64-NEXT: s_cbranch_execz .LBB2_6 +; GFX1100_W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX1100_W64-NEXT: s_cmov_b64 exec, vcc +; GFX1100_W64-NEXT: s_cbranch_scc0 .LBB2_6 ; GFX1100_W64-NEXT: ; %bb.3: ; %bb1 ; GFX1100_W64-NEXT: v_mov_b32_e32 v0, s4 -; GFX1100_W64-NEXT: s_mov_b64 s[8:9], exec ; GFX1100_W64-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 ; GFX1100_W64-NEXT: v_readfirstlane_b32 s4, v4 ; GFX1100_W64-NEXT: v_readfirstlane_b32 s5, v5 @@ -1040,15 +1032,16 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX1100_W64-NEXT: buffer_load_format_x v9, v0, s[4:7], 0 idxen +; GFX1100_W64-NEXT: s_xor_b64 s[4:5], exec, s[0:1] ; GFX1100_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1100_W64-NEXT: ; implicit-def: $vgpr0 -; GFX1100_W64-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1100_W64-NEXT: s_cbranch_execnz .LBB2_4 -; GFX1100_W64-NEXT: ; %bb.5: -; GFX1100_W64-NEXT: s_mov_b64 exec, s[8:9] -; GFX1100_W64-NEXT: .LBB2_6: ; %bb2 ; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1100_W64-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1100_W64-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX1100_W64-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1100_W64-NEXT: ; %bb.5: ; GFX1100_W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1100_W64-NEXT: .LBB2_6: ; %bb2 ; GFX1100_W64-NEXT: s_waitcnt vmcnt(0) ; GFX1100_W64-NEXT: global_store_b32 v[11:12], v9, off dlc ; GFX1100_W64-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1078,7 +1071,7 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; W64-O0-NEXT: v_mov_b32_e32 v8, v0 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 @@ -1135,17 +1128,15 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; W64-O0-NEXT: v_writelane_b32 v0, s5, 0 ; W64-O0-NEXT: s_mov_b32 s5, 0 ; W64-O0-NEXT: v_writelane_b32 v0, s5, 1 -; W64-O0-NEXT: v_mov_b32_e32 v1, s4 -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v0, s4, 2 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 3 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: v_mov_b32_e32 v0, s4 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: s_mov_b64 s[4:5], exec ; W64-O0-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload @@ -1167,64 +1158,64 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; W64-O0-NEXT: s_mov_b32 s9, s12 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_writelane_b32 v0, s8, 4 -; W64-O0-NEXT: v_writelane_b32 v0, s9, 5 -; W64-O0-NEXT: v_writelane_b32 v0, s10, 6 -; W64-O0-NEXT: v_writelane_b32 v0, s11, 7 +; W64-O0-NEXT: v_writelane_b32 v0, s8, 2 +; W64-O0-NEXT: v_writelane_b32 v0, s9, 3 +; W64-O0-NEXT: v_writelane_b32 v0, s10, 4 +; W64-O0-NEXT: v_writelane_b32 v0, s11, 5 ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; W64-O0-NEXT: v_writelane_b32 v0, s4, 8 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 9 +; W64-O0-NEXT: v_writelane_b32 v0, s4, 6 +; W64-O0-NEXT: v_writelane_b32 v0, s5, 7 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: ; %bb.2: ; in Loop: Header=BB2_1 Depth=1 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v1, 8 -; W64-O0-NEXT: v_readlane_b32 s5, v1, 9 -; W64-O0-NEXT: v_readlane_b32 s8, v1, 4 -; W64-O0-NEXT: v_readlane_b32 s9, v1, 5 -; W64-O0-NEXT: v_readlane_b32 s10, v1, 6 -; W64-O0-NEXT: v_readlane_b32 s11, v1, 7 -; W64-O0-NEXT: v_readlane_b32 s6, v1, 1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; W64-O0-NEXT: v_readlane_b32 s6, v1, 6 +; W64-O0-NEXT: v_readlane_b32 s7, v1, 7 +; W64-O0-NEXT: v_readlane_b32 s8, v1, 2 +; W64-O0-NEXT: v_readlane_b32 s9, v1, 3 +; W64-O0-NEXT: v_readlane_b32 s10, v1, 4 +; W64-O0-NEXT: v_readlane_b32 s11, v1, 5 +; W64-O0-NEXT: v_readlane_b32 s4, v1, 1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_nop 2 -; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen +; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s4 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] -; W64-O0-NEXT: s_cbranch_execnz .LBB2_1 +; W64-O0-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; W64-O0-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; W64-O0-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; W64-O0-NEXT: s_cbranch_scc1 .LBB2_1 ; W64-O0-NEXT: ; %bb.3: ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s6, v0, 2 -; W64-O0-NEXT: v_readlane_b32 s7, v0, 3 -; W64-O0-NEXT: s_mov_b64 exec, s[6:7] ; W64-O0-NEXT: v_readlane_b32 s4, v0, 1 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b32 s5, 0x3ff ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: v_and_b32_e64 v2, v2, s5 -; W64-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v2, s4 +; W64-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v2, s4 ; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v0, s4, 10 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 11 +; W64-O0-NEXT: s_mov_b64 s[6:7], exec +; W64-O0-NEXT: v_writelane_b32 v0, s6, 8 +; W64-O0-NEXT: v_writelane_b32 v0, s7, 9 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] -; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: s_cbranch_execz .LBB2_8 -; W64-O0-NEXT: ; %bb.4: ; %bb1 +; W64-O0-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; W64-O0-NEXT: s_cmov_b64 exec, s[4:5] +; W64-O0-NEXT: s_cbranch_scc1 .LBB2_4 +; W64-O0-NEXT: s_branch .LBB2_8 +; W64-O0-NEXT: .LBB2_4: ; %bb1 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: v_readlane_b32 s4, v0, 0 @@ -1251,18 +1242,16 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b32 s5, 0 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 12 -; W64-O0-NEXT: v_mov_b32_e32 v1, s4 -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v0, s4, 13 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 14 +; W64-O0-NEXT: v_writelane_b32 v0, s5, 10 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: v_mov_b32_e32 v0, s4 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; W64-O0-NEXT: s_mov_b64 s[4:5], exec ; W64-O0-NEXT: .LBB2_5: ; =>This Inner Loop Header: Depth=1 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -1284,55 +1273,53 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, ; W64-O0-NEXT: s_mov_b32 s9, s12 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_writelane_b32 v0, s8, 15 -; W64-O0-NEXT: v_writelane_b32 v0, s9, 16 -; W64-O0-NEXT: v_writelane_b32 v0, s10, 17 -; W64-O0-NEXT: v_writelane_b32 v0, s11, 18 +; W64-O0-NEXT: v_writelane_b32 v0, s8, 11 +; W64-O0-NEXT: v_writelane_b32 v0, s9, 12 +; W64-O0-NEXT: v_writelane_b32 v0, s10, 13 +; W64-O0-NEXT: v_writelane_b32 v0, s11, 14 ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; W64-O0-NEXT: v_writelane_b32 v0, s4, 19 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 20 +; W64-O0-NEXT: v_writelane_b32 v0, s4, 15 +; W64-O0-NEXT: v_writelane_b32 v0, s5, 16 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: ; %bb.6: ; in Loop: Header=BB2_5 Depth=1 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v1, 19 -; W64-O0-NEXT: v_readlane_b32 s5, v1, 20 -; W64-O0-NEXT: v_readlane_b32 s8, v1, 15 -; W64-O0-NEXT: v_readlane_b32 s9, v1, 16 -; W64-O0-NEXT: v_readlane_b32 s10, v1, 17 -; W64-O0-NEXT: v_readlane_b32 s11, v1, 18 -; W64-O0-NEXT: v_readlane_b32 s6, v1, 12 +; W64-O0-NEXT: v_readlane_b32 s6, v1, 15 +; W64-O0-NEXT: v_readlane_b32 s7, v1, 16 +; W64-O0-NEXT: v_readlane_b32 s8, v1, 11 +; W64-O0-NEXT: v_readlane_b32 s9, v1, 12 +; W64-O0-NEXT: v_readlane_b32 s10, v1, 13 +; W64-O0-NEXT: v_readlane_b32 s11, v1, 14 +; W64-O0-NEXT: v_readlane_b32 s4, v1, 10 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_nop 2 -; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen +; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s4 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] -; W64-O0-NEXT: s_cbranch_execnz .LBB2_5 +; W64-O0-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; W64-O0-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; W64-O0-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; W64-O0-NEXT: s_cbranch_scc1 .LBB2_5 ; W64-O0-NEXT: ; %bb.7: ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v1, 13 -; W64-O0-NEXT: v_readlane_b32 s5, v1, 14 -; W64-O0-NEXT: s_mov_b64 exec, s[4:5] +; W64-O0-NEXT: v_readlane_b32 s4, v1, 8 +; W64-O0-NEXT: v_readlane_b32 s5, v1, 9 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; W64-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; W64-O0-NEXT: .LBB2_8: ; %bb2 ; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[16:17] -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v0, 10 -; W64-O0-NEXT: v_readlane_b32 s5, v0, 11 -; W64-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir index ece2e1b653d34..e7c9e33887a8d 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir +++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir @@ -62,11 +62,10 @@ body: | ; W64-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; W64-NEXT: {{ $}} ; W64-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[COPY1]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec - ; W64-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; W64-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; W64-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; W64-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; W64-NEXT: {{ $}} ; W64-NEXT: .3: - ; W64-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; W64-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] ; W64-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_IDXEN]] ; W64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 @@ -107,11 +106,10 @@ body: | ; W32-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; W32-NEXT: {{ $}} ; W32-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[COPY1]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec - ; W32-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; W32-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; W32-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; W32-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.1, implicit $exec ; W32-NEXT: {{ $}} ; W32-NEXT: .3: - ; W32-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_]] ; W32-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] ; W32-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_IDXEN]] ; W32-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 @@ -177,11 +175,10 @@ body: | ; W64-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; W64-NEXT: {{ $}} ; W64-NEXT: [[BUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec - ; W64-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; W64-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; W64-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; W64-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; W64-NEXT: {{ $}} ; W64-NEXT: .3: - ; W64-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; W64-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] ; W64-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_OFFEN]] ; W64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 @@ -222,11 +219,10 @@ body: | ; W32-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; W32-NEXT: {{ $}} ; W32-NEXT: [[BUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec - ; W32-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; W32-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; W32-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; W32-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.1, implicit $exec ; W32-NEXT: {{ $}} ; W32-NEXT: .3: - ; W32-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_]] ; W32-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] ; W32-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_OFFEN]] ; W32-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 @@ -292,11 +288,10 @@ body: | ; W64-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; W64-NEXT: {{ $}} ; W64-NEXT: [[BUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN [[COPY1]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec - ; W64-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; W64-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; W64-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; W64-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; W64-NEXT: {{ $}} ; W64-NEXT: .3: - ; W64-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; W64-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] ; W64-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_BOTHEN]] ; W64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 @@ -337,11 +332,10 @@ body: | ; W32-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; W32-NEXT: {{ $}} ; W32-NEXT: [[BUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN [[COPY1]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec - ; W32-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; W32-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; W32-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; W32-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.1, implicit $exec ; W32-NEXT: {{ $}} ; W32-NEXT: .3: - ; W32-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_]] ; W32-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] ; W32-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_BOTHEN]] ; W32-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 @@ -512,11 +506,10 @@ body: | ; W64-NO-ADDR64-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; W64-NO-ADDR64-NEXT: {{ $}} ; W64-NO-ADDR64-NEXT: [[BUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec - ; W64-NO-ADDR64-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; W64-NO-ADDR64-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; W64-NO-ADDR64-NEXT: [[S_XOR_B64_term:%[0-9]+]]:sreg_64_xexec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; W64-NO-ADDR64-NEXT: SI_WATERFALL_LOOP [[S_XOR_B64_term]], [[S_AND_SAVEEXEC_B64_]], %bb.1, implicit $exec ; W64-NO-ADDR64-NEXT: {{ $}} ; W64-NO-ADDR64-NEXT: .3: - ; W64-NO-ADDR64-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] ; W64-NO-ADDR64-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] ; W64-NO-ADDR64-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_OFFSET]] ; W64-NO-ADDR64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 @@ -557,11 +550,10 @@ body: | ; W32-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; W32-NEXT: {{ $}} ; W32-NEXT: [[BUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec - ; W32-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc - ; W32-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; W32-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; W32-NEXT: SI_WATERFALL_LOOP [[S_XOR_B32_term]], [[S_AND_SAVEEXEC_B32_]], %bb.1, implicit $exec ; W32-NEXT: {{ $}} ; W32-NEXT: .3: - ; W32-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_]] ; W32-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] ; W32-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_OFFSET]] ; W32-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll index 1e9994dd8e6ef..dc338ce1cc9c9 100644 --- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -24,10 +24,11 @@ define void @lsr_order_mul24_0(i32 %arg, i32 %arg2, i32 %arg6, i32 %arg13, i32 % ; GFX9-NEXT: v_add_u32_e32 v5, v5, v0 ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB0_1 +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX9-NEXT: ; %bb.2: ; %.loopexit -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] bb: @@ -55,10 +56,12 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, ptr addrspace(3) ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v5, 1, v18 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 ; GFX9-NEXT: v_cmp_lt_u32_e64 s[4:5], v0, v1 -; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB1_3 +; GFX9-NEXT: s_mov_b64 s[8:9], exec +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; GFX9-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 ; GFX9-NEXT: ; %bb.1: ; %bb19 ; GFX9-NEXT: v_cvt_f32_u32_e32 v7, v6 ; GFX9-NEXT: v_add_u32_e32 v4, v4, v0 @@ -94,14 +97,17 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, ptr addrspace(3) ; GFX9-NEXT: global_load_dword v3, v[18:19], off ; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v0, v1 ; GFX9-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11] +; GFX9-NEXT: s_andn2_b64 s[6:7], exec, s[10:11] +; GFX9-NEXT: s_and_b64 s[12:13], s[6:7], -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[4:5] ; GFX9-NEXT: ds_write_b32 v6, v3 ; GFX9-NEXT: v_add_u32_e32 v6, v6, v8 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GFX9-NEXT: s_cbranch_execnz .LBB1_2 -; GFX9-NEXT: .LBB1_3: ; %Flow2 +; GFX9-NEXT: s_cselect_b64 exec, s[6:7], s[10:11] +; GFX9-NEXT: s_cbranch_scc1 .LBB1_2 +; GFX9-NEXT: ; %bb.3: ; %Flow ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-NEXT: .LBB1_4: ; %Flow2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] bb: diff --git a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll index 4eefff504f19e..599a2ef4683a3 100644 --- a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll +++ b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; RUN: opt -mtriple=amdgcn-- -mcpu=gfx600 -S -lowerswitch -amdgpu-unify-divergent-exit-nodes -verify -structurizecfg -verify -si-annotate-control-flow -simplifycfg-require-and-preserve-domtree=1 %s | FileCheck -check-prefix=IR %s ; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -S -lowerswitch -amdgpu-unify-divergent-exit-nodes -verify -structurizecfg -verify -si-annotate-control-flow -simplifycfg-require-and-preserve-domtree=1 %s | FileCheck -check-prefix=IR %s ; RUN: llc -mtriple=amdgcn -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=GCN %s diff --git a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll index 16de2c0c6de08..00e5a85632497 100644 --- a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll +++ b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -mtriple=amdgcn-- -lowerswitch -structurizecfg -si-annotate-control-flow < %s | FileCheck -check-prefix=OPT %s ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s @@ -24,22 +25,20 @@ define amdgpu_vs void @multi_else_break(<4 x float> %vec, i32 %ub, i32 %cont) { ; OPT-NEXT: [[TMP3]] = phi i32 [ [[TMP47:%.*]], [[ENDIF]] ], [ undef, [[LOOP]] ] ; OPT-NEXT: [[TMP4:%.*]] = phi i1 [ [[TMP51:%.*]], [[ENDIF]] ], [ true, [[LOOP]] ] ; OPT-NEXT: [[TMP5:%.*]] = phi i1 [ [[TMP51_INV:%.*]], [[ENDIF]] ], [ true, [[LOOP]] ] -; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP2]]) ; OPT-NEXT: [[TMP6]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP5]], i64 [[PHI_BROKEN]]) ; OPT-NEXT: [[TMP7:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP6]]) ; OPT-NEXT: [[TMP8]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP4]], i64 [[PHI_BROKEN2]]) ; OPT-NEXT: br i1 [[TMP7]], label [[FLOW1]], label [[LOOP]] ; OPT: Flow1: -; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP6]]) ; OPT-NEXT: [[TMP9:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP8]]) ; OPT-NEXT: br i1 [[TMP9]], label [[IF:%.*]], label [[LOOP_OUTER]] ; OPT: IF: -; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP8]]) ; OPT-NEXT: ret void ; OPT: ENDIF: ; OPT-NEXT: [[TMP47]] = add i32 [[TMP45]], 1 ; OPT-NEXT: [[TMP51]] = icmp eq i32 [[TMP47]], [[CONT:%.*]] ; OPT-NEXT: [[TMP51_INV]] = xor i1 [[TMP51]], true +; OPT-NEXT: call void @llvm.amdgcn.wave.reconverge.i64(i64 [[TMP2]]) ; OPT-NEXT: br label [[FLOW]] ; ; GCN-LABEL: multi_else_break: @@ -158,7 +157,6 @@ define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 { ; OPT-NEXT: [[TMP10]] = phi i1 [ [[CMP1]], [[CASE0]] ], [ [[TMP7]], [[LEAFBLOCK]] ] ; OPT-NEXT: br label [[FLOW4]] ; OPT: bb9: -; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP4]]) ; OPT-NEXT: ret void ; ; GCN-LABEL: multi_if_break_loop: diff --git a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll index f6e3509eb029b..dfd438f37c77b 100644 --- a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll +++ b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: opt -mtriple=amdgcn-- -S -structurizecfg -si-annotate-control-flow %s | FileCheck -check-prefix=IR %s ; RUN: llc -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s @@ -93,7 +94,6 @@ define amdgpu_kernel void @reduced_nested_loop_conditions(ptr addrspace(3) nocap ; IR: bb23: ; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP6]]) ; IR-NEXT: ret void -; bb: %my.tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %my.tmp1 = getelementptr inbounds i64, ptr addrspace(3) %arg, i32 %my.tmp @@ -277,7 +277,6 @@ define amdgpu_kernel void @nested_loop_conditions(ptr addrspace(1) nocapture %ar ; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP7]]) ; IR-NEXT: store volatile i32 0, ptr addrspace(1) undef, align 4 ; IR-NEXT: ret void -; bb: %my.tmp1134 = load volatile i32, ptr addrspace(1) undef %my.tmp1235 = icmp slt i32 %my.tmp1134, 9 diff --git a/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll b/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll index d62f045674ace..a390212e9f753 100644 --- a/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll +++ b/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll @@ -16,17 +16,21 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) { ; GFX10-NEXT: .p2align 6 ; GFX10-NEXT: .LBB0_1: ; %Flow ; GFX10-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: s_and_b32 s0, exec_lo, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: s_or_b32 s1, s0, s1 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s1 -; GFX10-NEXT: s_cbranch_execz .LBB0_4 +; GFX10-NEXT: s_andn2_b32 s0, exec_lo, s1 +; GFX10-NEXT: s_and_b32 s3, s0, -1 +; GFX10-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX10-NEXT: s_cbranch_scc0 .LBB0_4 ; GFX10-NEXT: .LBB0_2: ; %bb ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_or_b32 s2, s2, exec_lo -; GFX10-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB0_1 +; GFX10-NEXT: s_and_b32 s0, vcc_lo, exec_lo +; GFX10-NEXT: s_mov_b32 s3, exec_lo +; GFX10-NEXT: s_and_b32 s5, s0, -1 +; GFX10-NEXT: s_cmov_b32 exec_lo, s0 +; GFX10-NEXT: s_cbranch_scc0 .LBB0_1 ; GFX10-NEXT: ; %bb.3: ; %branch2_merge ; GFX10-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; GFX10-NEXT: s_mov_b32 s5, s4 @@ -47,6 +51,7 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) { ; GFX10-NEXT: v_cmp_le_f32_e64 s0, 0, v1 ; GFX10-NEXT: s_and_b32 s0, s0, exec_lo ; GFX10-NEXT: s_or_b32 s2, s2, s0 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX10-NEXT: s_branch .LBB0_1 ; GFX10-NEXT: .LBB0_4: ; %loop0_merge ; GFX10-NEXT: s_inst_prefetch 0x2 @@ -63,18 +68,22 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) { ; GFX12-NEXT: s_branch .LBB0_2 ; GFX12-NEXT: .LBB0_1: ; %Flow ; GFX12-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX12-NEXT: v_mov_b32_e32 v1, v0 ; GFX12-NEXT: s_and_b32 s0, exec_lo, s2 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v1, v0 ; GFX12-NEXT: s_or_b32 s1, s0, s1 -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execz .LBB0_4 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 s0, exec_lo, s1 +; GFX12-NEXT: s_and_b32 s3, s0, -1 +; GFX12-NEXT: s_cselect_b32 exec_lo, s0, s1 +; GFX12-NEXT: s_cbranch_scc0 .LBB0_4 ; GFX12-NEXT: .LBB0_2: ; %bb ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_or_b32 s2, s2, exec_lo -; GFX12-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX12-NEXT: s_cbranch_execz .LBB0_1 +; GFX12-NEXT: s_and_b32 s0, vcc_lo, exec_lo +; GFX12-NEXT: s_mov_b32 s3, exec_lo +; GFX12-NEXT: s_and_b32 s5, s0, -1 +; GFX12-NEXT: s_cmov_b32 exec_lo, s0 +; GFX12-NEXT: s_cbranch_scc0 .LBB0_1 ; GFX12-NEXT: ; %bb.3: ; %branch2_merge ; GFX12-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; GFX12-NEXT: s_mov_b32 s5, s4 @@ -97,6 +106,7 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) { ; GFX12-NEXT: s_and_b32 s0, s0, exec_lo ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_or_b32 s2, s2, s0 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX12-NEXT: s_branch .LBB0_1 ; GFX12-NEXT: .LBB0_4: ; %loop0_merge ; GFX12-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll index ba012b208c957..24bba7bf97cd4 100644 --- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll @@ -219,71 +219,83 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3 ; MUBUF-LABEL: func_non_entry_block_static_alloca_align4: ; MUBUF: ; %bb.0: ; %entry ; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; MUBUF-NEXT: s_mov_b32 s7, s33 -; MUBUF-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; MUBUF-NEXT: s_mov_b32 s10, s33 ; MUBUF-NEXT: s_mov_b32 s33, s32 ; MUBUF-NEXT: s_addk_i32 s32, 0x400 -; MUBUF-NEXT: s_and_saveexec_b64 s[4:5], vcc -; MUBUF-NEXT: s_cbranch_execz .LBB2_3 +; MUBUF-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; MUBUF-NEXT: s_mov_b64 s[4:5], exec +; MUBUF-NEXT: s_and_b64 s[6:7], vcc, -1 +; MUBUF-NEXT: s_cmov_b64 exec, vcc +; MUBUF-NEXT: s_cbranch_scc0 .LBB2_4 ; MUBUF-NEXT: ; %bb.1: ; %bb.0 ; MUBUF-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; MUBUF-NEXT: s_and_b64 exec, exec, vcc -; MUBUF-NEXT: s_cbranch_execz .LBB2_3 +; MUBUF-NEXT: s_mov_b64 s[6:7], exec +; MUBUF-NEXT: s_and_b64 s[8:9], vcc, -1 +; MUBUF-NEXT: s_cmov_b64 exec, vcc +; MUBUF-NEXT: s_cbranch_scc0 .LBB2_3 ; MUBUF-NEXT: ; %bb.2: ; %bb.1 -; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000 +; MUBUF-NEXT: s_add_i32 s8, s32, 0x1000 ; MUBUF-NEXT: v_mov_b32_e32 v2, 0 -; MUBUF-NEXT: v_mov_b32_e32 v3, s6 +; MUBUF-NEXT: v_mov_b32_e32 v3, s8 ; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; MUBUF-NEXT: v_mov_b32_e32 v2, 1 ; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:4 -; MUBUF-NEXT: v_lshl_add_u32 v2, v4, 2, s6 +; MUBUF-NEXT: v_lshl_add_u32 v2, v4, 2, s8 ; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen ; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v31 -; MUBUF-NEXT: s_mov_b32 s32, s6 +; MUBUF-NEXT: s_mov_b32 s32, s8 ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: v_add_u32_e32 v2, v2, v3 ; MUBUF-NEXT: global_store_dword v[0:1], v2, off -; MUBUF-NEXT: .LBB2_3: ; %bb.2 +; MUBUF-NEXT: s_or_b64 exec, exec, s[6:7] +; MUBUF-NEXT: .LBB2_3: ; %Flow ; MUBUF-NEXT: s_or_b64 exec, exec, s[4:5] +; MUBUF-NEXT: .LBB2_4: ; %bb.2 ; MUBUF-NEXT: v_mov_b32_e32 v0, 0 ; MUBUF-NEXT: global_store_dword v[0:1], v0, off ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_addk_i32 s32, 0xfc00 -; MUBUF-NEXT: s_mov_b32 s33, s7 +; MUBUF-NEXT: s_mov_b32 s33, s10 ; MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; FLATSCR-LABEL: func_non_entry_block_static_alloca_align4: ; FLATSCR: ; %bb.0: ; %entry ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FLATSCR-NEXT: s_mov_b32 s3, s33 -; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; FLATSCR-NEXT: s_mov_b32 s6, s33 ; FLATSCR-NEXT: s_mov_b32 s33, s32 ; FLATSCR-NEXT: s_add_i32 s32, s32, 16 -; FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc -; FLATSCR-NEXT: s_cbranch_execz .LBB2_3 +; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; FLATSCR-NEXT: s_mov_b64 s[0:1], exec +; FLATSCR-NEXT: s_and_b64 s[2:3], vcc, -1 +; FLATSCR-NEXT: s_cmov_b64 exec, vcc +; FLATSCR-NEXT: s_cbranch_scc0 .LBB2_4 ; FLATSCR-NEXT: ; %bb.1: ; %bb.0 ; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; FLATSCR-NEXT: s_and_b64 exec, exec, vcc -; FLATSCR-NEXT: s_cbranch_execz .LBB2_3 +; FLATSCR-NEXT: s_mov_b64 s[2:3], exec +; FLATSCR-NEXT: s_and_b64 s[4:5], vcc, -1 +; FLATSCR-NEXT: s_cmov_b64 exec, vcc +; FLATSCR-NEXT: s_cbranch_scc0 .LBB2_3 ; FLATSCR-NEXT: ; %bb.2: ; %bb.1 -; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000 +; FLATSCR-NEXT: s_add_i32 s4, s32, 0x1000 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 0 ; FLATSCR-NEXT: v_mov_b32_e32 v3, 1 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[2:3], s2 -; FLATSCR-NEXT: v_lshl_add_u32 v2, v4, 2, s2 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[2:3], s4 +; FLATSCR-NEXT: v_lshl_add_u32 v2, v4, 2, s4 ; FLATSCR-NEXT: scratch_load_dword v2, v2, off ; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v31 -; FLATSCR-NEXT: s_mov_b32 s32, s2 +; FLATSCR-NEXT: s_mov_b32 s32, s4 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3 ; FLATSCR-NEXT: global_store_dword v[0:1], v2, off -; FLATSCR-NEXT: .LBB2_3: ; %bb.2 +; FLATSCR-NEXT: s_or_b64 exec, exec, s[2:3] +; FLATSCR-NEXT: .LBB2_3: ; %Flow ; FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1] +; FLATSCR-NEXT: .LBB2_4: ; %bb.2 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 ; FLATSCR-NEXT: global_store_dword v[0:1], v0, off ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_add_i32 s32, s32, -16 -; FLATSCR-NEXT: s_mov_b32 s33, s3 +; FLATSCR-NEXT: s_mov_b32 s33, s6 ; FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: @@ -316,13 +328,15 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i ; MUBUF-LABEL: func_non_entry_block_static_alloca_align64: ; MUBUF: ; %bb.0: ; %entry ; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; MUBUF-NEXT: s_mov_b32 s7, s33 +; MUBUF-NEXT: s_mov_b32 s8, s33 ; MUBUF-NEXT: s_add_i32 s33, s32, 0xfc0 -; MUBUF-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; MUBUF-NEXT: s_and_b32 s33, s33, 0xfffff000 ; MUBUF-NEXT: s_addk_i32 s32, 0x2000 -; MUBUF-NEXT: s_and_saveexec_b64 s[4:5], vcc -; MUBUF-NEXT: s_cbranch_execz .LBB3_2 +; MUBUF-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; MUBUF-NEXT: s_mov_b64 s[4:5], exec +; MUBUF-NEXT: s_and_b64 s[6:7], vcc, -1 +; MUBUF-NEXT: s_cmov_b64 exec, vcc +; MUBUF-NEXT: s_cbranch_scc0 .LBB3_2 ; MUBUF-NEXT: ; %bb.1: ; %bb.0 ; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000 ; MUBUF-NEXT: s_and_b32 s6, s6, 0xfffff000 @@ -338,25 +352,27 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: v_add_u32_e32 v2, v2, v3 ; MUBUF-NEXT: global_store_dword v[0:1], v2, off -; MUBUF-NEXT: .LBB3_2: ; %bb.1 ; MUBUF-NEXT: s_or_b64 exec, exec, s[4:5] +; MUBUF-NEXT: .LBB3_2: ; %bb.1 ; MUBUF-NEXT: v_mov_b32_e32 v0, 0 ; MUBUF-NEXT: global_store_dword v[0:1], v0, off ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_addk_i32 s32, 0xe000 -; MUBUF-NEXT: s_mov_b32 s33, s7 +; MUBUF-NEXT: s_mov_b32 s33, s8 ; MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; FLATSCR-LABEL: func_non_entry_block_static_alloca_align64: ; FLATSCR: ; %bb.0: ; %entry ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FLATSCR-NEXT: s_mov_b32 s3, s33 +; FLATSCR-NEXT: s_mov_b32 s4, s33 ; FLATSCR-NEXT: s_add_i32 s33, s32, 63 -; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; FLATSCR-NEXT: s_andn2_b32 s33, s33, 63 ; FLATSCR-NEXT: s_addk_i32 s32, 0x80 -; FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc -; FLATSCR-NEXT: s_cbranch_execz .LBB3_2 +; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; FLATSCR-NEXT: s_mov_b64 s[0:1], exec +; FLATSCR-NEXT: s_and_b64 s[2:3], vcc, -1 +; FLATSCR-NEXT: s_cmov_b64 exec, vcc +; FLATSCR-NEXT: s_cbranch_scc0 .LBB3_2 ; FLATSCR-NEXT: ; %bb.1: ; %bb.0 ; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000 ; FLATSCR-NEXT: s_and_b32 s2, s2, 0xfffff000 @@ -370,13 +386,13 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3 ; FLATSCR-NEXT: global_store_dword v[0:1], v2, off -; FLATSCR-NEXT: .LBB3_2: ; %bb.1 ; FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1] +; FLATSCR-NEXT: .LBB3_2: ; %bb.1 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 ; FLATSCR-NEXT: global_store_dword v[0:1], v0, off ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_addk_i32 s32, 0xff80 -; FLATSCR-NEXT: s_mov_b32 s33, s3 +; FLATSCR-NEXT: s_mov_b32 s33, s4 ; FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %cond = icmp eq i32 %arg.cond, 0 @@ -406,3 +422,7 @@ attributes #1 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amd !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 CODE_OBJECT_VERSION} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; ASSUME1024: {{.*}} +; DEFAULTSIZE: {{.*}} +; DEFAULTSIZE-V5: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir b/llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir index 748775dc2cf1d..2c20af85c6966 100644 --- a/llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir +++ b/llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir @@ -115,9 +115,9 @@ body: | ; GCN-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub1_sub2_sub3_sub4_sub5, killed [[REG_SEQUENCE3]], %subreg.sub1_sub2_sub3_sub4_sub5_sub6 ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GCN-NEXT: BUFFER_STORE_DWORD_ADDR64 killed [[V_MOV_B32_e32_1]], [[V_LSHL_B64_e64_]], killed [[REG_SEQUENCE4]], 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2.bb2: - ; GCN-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: S_ENDPGM 0 bb.0.bb: successors: %bb.1.bb1(0x40000000), %bb.2.bb2(0x40000000) @@ -158,9 +158,9 @@ body: | %29 = V_MOV_B32_e32 0, implicit $exec %30 = COPY %24 BUFFER_STORE_DWORD_ADDR64 killed %29, killed %30, killed %28, 0, 0, 0, 0, implicit $exec + SI_WAVE_RECONVERGE %1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.2.bb2: - SI_END_CF %1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_ENDPGM 0 ... @@ -257,9 +257,9 @@ body: | ; GCN-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GCN-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_LSHL_B64_]] ; GCN-NEXT: BUFFER_STORE_DWORD_ADDR64 killed [[V_MOV_B32_e32_2]], killed [[COPY3]], killed [[REG_SEQUENCE5]], 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2.bb2: - ; GCN-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: S_ENDPGM 0 bb.0.bb: successors: %bb.1.bb1(0x40000000), %bb.2.bb2(0x40000000) @@ -308,9 +308,9 @@ body: | %38 = V_MOV_B32_e32 0, implicit $exec %39 = COPY %33 BUFFER_STORE_DWORD_ADDR64 killed %38, killed %39, killed %37, 0, 0, 0, 0, implicit $exec + SI_WAVE_RECONVERGE %1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.2.bb2: - SI_END_CF %1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_ENDPGM 0 ... @@ -388,9 +388,9 @@ body: | ; GCN-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub1_sub2_sub3_sub4_sub5, killed [[REG_SEQUENCE2]], %subreg.sub1_sub2_sub3_sub4_sub5_sub6 ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GCN-NEXT: BUFFER_STORE_DWORD_ADDR64 killed [[V_MOV_B32_e32_1]], [[V_LSHL_B64_e64_]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2.bb2: - ; GCN-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: S_ENDPGM 0 bb.0.bb: successors: %bb.1.bb1(0x40000000), %bb.2.bb2(0x40000000) @@ -431,9 +431,9 @@ body: | %29 = V_MOV_B32_e32 0, implicit $exec %30 = COPY %24 BUFFER_STORE_DWORD_ADDR64 killed %29, killed %30, killed %28, 0, 0, 0, 0, implicit $exec + SI_WAVE_RECONVERGE %1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.2.bb2: - SI_END_CF %1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_ENDPGM 0 ... diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll index 538ce15979de8..cbfcfea15df60 100644 --- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll +++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s ; CHECK: .amdgpu_pal_metadata diff --git a/llvm/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir b/llvm/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir index 83c30507ce3ce..94537ad984044 100644 --- a/llvm/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir +++ b/llvm/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir @@ -1,3 +1,4 @@ +# XFAIL: * # RUN: llc -mtriple amdgcn -run-pass livevars -run-pass phi-node-elimination -verify-machineinstrs -o - %s | FileCheck %s # CHECK-LABEL: phi-cf-test @@ -36,7 +37,6 @@ body: | %24:sreg_64 = PHI %20, %bb.3, %22, %bb.0 %23:vgpr_32 = PHI %19, %bb.3, %18, %bb.0 - SI_END_CF %24, implicit-def dead $exec, implicit-def dead $scc, implicit $exec %3:vgpr_32, dead %10:sreg_64 = nsw V_ADD_CO_U32_e64 1, %23, 0, implicit $exec bb.3: @@ -48,7 +48,11 @@ body: | %17:sgpr_128 = REG_SEQUENCE undef %14:sreg_32_xm0, %subreg.sub0, undef %12:sreg_32_xm0, %subreg.sub1, %16, %subreg.sub2, %15, %subreg.sub3 BUFFER_STORE_DWORD_OFFSET %4, %17, 0, 0, 0, 0, implicit $exec :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1) %19:vgpr_32 = COPY %4 - %20:sreg_64 = SI_IF %0, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + %20:sreg_64 = SI_IF %0, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.3 + bb.4: + SI_WAVE_RECONVERGE %24, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + S_BRANCH %bb.2 + ... diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll index b068d87c4d6f4..1076d136b1b9a 100644 --- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll @@ -71,9 +71,10 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_or_b32_e32 v13, v7, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] -; GFX9-NEXT: v_mov_b32_e32 v22, v20 +; GFX9-NEXT: s_mov_b64 s[8:9], exec ; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GFX9-NEXT: v_mov_b32_e32 v22, v20 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc ; GFX9-NEXT: v_and_b32_e32 v10, 1, v10 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 @@ -82,13 +83,15 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13] ; GFX9-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], vcc +; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec ; GFX9-NEXT: v_cndmask_b32_e64 v11, v1, 0, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v12, v0, 0, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v10, v3, 0, s[4:5] +; GFX9-NEXT: s_and_b64 s[10:11], s[6:7], -1 ; GFX9-NEXT: v_cndmask_b32_e64 v13, v2, 0, s[4:5] -; GFX9-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB0_6 +; GFX9-NEXT: s_cmov_b64 exec, s[6:7] +; GFX9-NEXT: s_cbranch_scc0 .LBB0_6 ; GFX9-NEXT: ; %bb.1: ; %udiv-bb1 ; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 1, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v7, vcc @@ -106,21 +109,22 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_or_b32_e32 v8, v10, v12 ; GFX9-NEXT: v_or_b32_e32 v9, v9, v11 ; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v13 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v13 -; GFX9-NEXT: v_lshlrev_b64 v[12:13], v13, v[2:3] ; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v13 ; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_mov_b32_e32 v10, 0 +; GFX9-NEXT: v_lshlrev_b64 v[12:13], v13, v[2:3] ; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v1, s[6:7] ; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v0, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: v_mov_b32_e32 v10, 0 +; GFX9-NEXT: s_xor_b64 s[6:7], vcc, exec ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, v13, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: v_mov_b32_e32 v11, 0 +; GFX9-NEXT: s_and_b64 s[10:11], vcc, -1 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, v12, s[4:5] -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB0_5 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB0_5 ; GFX9-NEXT: ; %bb.2: ; %udiv-preheader ; GFX9-NEXT: v_sub_u32_e32 v10, 64, v24 ; GFX9-NEXT: v_lshrrev_b64 v[8:9], v24, v[2:3] @@ -183,16 +187,17 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_or_b32_e32 v19, v25, v27 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] ; GFX9-NEXT: v_and_b32_e32 v8, 1, v8 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 s[10:11], exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v19, v9 ; GFX9-NEXT: v_or3_b32 v7, v7, 0, v11 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; GFX9-NEXT: v_mov_b32_e32 v18, v8 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB0_3 +; GFX9-NEXT: s_cselect_b64 exec, s[10:11], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB0_3 ; GFX9-NEXT: ; %bb.4: ; %Flow -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: .LBB0_5: ; %Flow2 ; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: .LBB0_5: ; %Flow2 ; GFX9-NEXT: v_lshlrev_b64 v[14:15], 1, v[12:13] ; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 31, v13 @@ -200,8 +205,8 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_or3_b32 v12, v6, v12, v10 ; GFX9-NEXT: v_or_b32_e32 v10, v9, v15 ; GFX9-NEXT: v_or_b32_e32 v13, v8, v14 -; GFX9-NEXT: .LBB0_6: ; %Flow3 ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-NEXT: .LBB0_6: ; %udiv-end ; GFX9-NEXT: v_mul_lo_u32 v16, v13, v5 ; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v23, v13, 0 ; GFX9-NEXT: v_mov_b32_e32 v15, 0 @@ -243,8 +248,8 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane @@ -572,32 +577,31 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 -; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7] +; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[4:5], exec -; GFX9-O0-NEXT: v_writelane_b32 v0, s4, 2 -; GFX9-O0-NEXT: v_writelane_b32 v0, s5, 3 +; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec +; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 2 +; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 3 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O0-NEXT: s_cbranch_execz .LBB0_3 -; GFX9-O0-NEXT: s_branch .LBB0_8 +; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX9-O0-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9-O0-NEXT: s_cbranch_scc1 .LBB0_7 +; GFX9-O0-NEXT: s_branch .LBB0_2 ; GFX9-O0-NEXT: .LBB0_1: ; %Flow ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v0, 4 -; GFX9-O0-NEXT: v_readlane_b32 s5, v0, 5 -; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: ; %bb.2: ; %Flow +; GFX9-O0-NEXT: v_readlane_b32 s4, v8, 4 +; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 5 ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload @@ -619,15 +623,9 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_branch .LBB0_5 -; GFX9-O0-NEXT: .LBB0_3: ; %Flow2 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v4, 2 -; GFX9-O0-NEXT: v_readlane_b32 s5, v4, 3 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: s_branch .LBB0_4 +; GFX9-O0-NEXT: .LBB0_2: ; %Flow2 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -639,8 +637,14 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_branch .LBB0_9 -; GFX9-O0-NEXT: .LBB0_4: ; %udiv-loop-exit +; GFX9-O0-NEXT: s_branch .LBB0_8 +; GFX9-O0-NEXT: .LBB0_3: ; %udiv-loop-exit +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 2 +; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 3 ; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload @@ -649,13 +653,13 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b32 s4, 1 +; GFX9-O0-NEXT: s_mov_b32 s6, 1 ; GFX9-O0-NEXT: s_waitcnt vmcnt(2) -; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[0:1] +; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s6, v[0:1] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_lshlrev_b64 v[9:10], s4, v[9:10] -; GFX9-O0-NEXT: s_mov_b32 s4, 63 -; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] +; GFX9-O0-NEXT: v_lshlrev_b64 v[9:10], s6, v[9:10] +; GFX9-O0-NEXT: s_mov_b32 s6, 63 +; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s6, v[0:1] ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 ; GFX9-O0-NEXT: v_mov_b32_e32 v12, v8 @@ -679,15 +683,9 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_branch .LBB0_3 -; GFX9-O0-NEXT: .LBB0_5: ; %Flow1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v8, 6 -; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 7 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: s_branch .LBB0_2 +; GFX9-O0-NEXT: .LBB0_4: ; %Flow1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload @@ -709,15 +707,15 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_branch .LBB0_4 -; GFX9-O0-NEXT: .LBB0_6: ; %udiv-do-while +; GFX9-O0-NEXT: s_branch .LBB0_3 +; GFX9-O0-NEXT: .LBB0_5: ; %udiv-do-while ; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s6, v16, 8 -; GFX9-O0-NEXT: v_readlane_b32 s7, v16, 9 +; GFX9-O0-NEXT: v_readlane_b32 s6, v16, 6 +; GFX9-O0-NEXT: v_readlane_b32 s7, v16, 7 ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload @@ -878,7 +876,7 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v18, v19 ; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[17:18], v[12:13] -; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX9-O0-NEXT: s_or_b64 s[6:7], s[4:5], s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v18, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v2 ; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill @@ -899,12 +897,9 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 4 -; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 5 -; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 8 -; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 9 +; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX9-O0-NEXT: v_writelane_b32 v16, s4, 6 +; GFX9-O0-NEXT: v_writelane_b32 v16, s5, 7 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] @@ -932,10 +927,12 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: s_cbranch_execnz .LBB0_6 +; GFX9-O0-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX9-O0-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GFX9-O0-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX9-O0-NEXT: s_cbranch_scc1 .LBB0_5 ; GFX9-O0-NEXT: s_branch .LBB0_1 -; GFX9-O0-NEXT: .LBB0_7: ; %udiv-preheader +; GFX9-O0-NEXT: .LBB0_6: ; %udiv-preheader ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload @@ -1038,8 +1035,8 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v14, s8 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, s7 ; GFX9-O0-NEXT: v_mov_b32_e32 v12, s6 -; GFX9-O0-NEXT: v_writelane_b32 v16, s4, 8 -; GFX9-O0-NEXT: v_writelane_b32 v16, s5, 9 +; GFX9-O0-NEXT: v_writelane_b32 v16, s4, 6 +; GFX9-O0-NEXT: v_writelane_b32 v16, s5, 7 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] @@ -1067,8 +1064,8 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_branch .LBB0_6 -; GFX9-O0-NEXT: .LBB0_8: ; %udiv-bb1 +; GFX9-O0-NEXT: s_branch .LBB0_5 +; GFX9-O0-NEXT: .LBB0_7: ; %udiv-bb1 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] @@ -1193,18 +1190,17 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec -; GFX9-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] -; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] -; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 6 -; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 7 +; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 4 +; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 5 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O0-NEXT: s_cbranch_execz .LBB0_5 -; GFX9-O0-NEXT: s_branch .LBB0_7 -; GFX9-O0-NEXT: .LBB0_9: ; %udiv-end +; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX9-O0-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9-O0-NEXT: s_cbranch_scc1 .LBB0_6 +; GFX9-O0-NEXT: s_branch .LBB0_4 +; GFX9-O0-NEXT: .LBB0_8: ; %udiv-end ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] @@ -1497,8 +1493,10 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) @@ -1551,6 +1549,7 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_subbrev_co_u32_e32 v10, vcc, 0, v11, vcc ; GFX9-NEXT: v_subbrev_co_u32_e32 v11, vcc, 0, v11, vcc ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] +; GFX9-NEXT: s_mov_b64 s[8:9], exec ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc @@ -1564,13 +1563,15 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13] ; GFX9-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], vcc +; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec ; GFX9-NEXT: v_cndmask_b32_e64 v15, v3, 0, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v13, v1, 0, s[4:5] +; GFX9-NEXT: s_and_b64 s[10:11], s[6:7], -1 ; GFX9-NEXT: v_cndmask_b32_e64 v12, v0, 0, s[4:5] -; GFX9-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB1_6 +; GFX9-NEXT: s_cmov_b64 exec, s[6:7] +; GFX9-NEXT: s_cbranch_scc0 .LBB1_6 ; GFX9-NEXT: ; %bb.1: ; %udiv-bb1 ; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 1, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v9, vcc @@ -1589,20 +1590,21 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_or_b32_e32 v11, v11, v13 ; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v15 ; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[4:5] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v15 ; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v11, s[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[10:11], v15, v[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v15 -; GFX9-NEXT: v_mov_b32_e32 v12, 0 -; GFX9-NEXT: v_mov_b32_e32 v14, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v3, s[6:7] ; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v2, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-NEXT: v_mov_b32_e32 v14, 0 +; GFX9-NEXT: s_xor_b64 s[6:7], vcc, exec ; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, v11, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v13, 0 ; GFX9-NEXT: v_mov_b32_e32 v15, 0 +; GFX9-NEXT: s_and_b64 s[10:11], vcc, -1 ; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, v10, s[4:5] -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB1_5 +; GFX9-NEXT: s_cmov_b64 exec, vcc +; GFX9-NEXT: s_cbranch_scc0 .LBB1_5 ; GFX9-NEXT: ; %bb.2: ; %udiv-preheader ; GFX9-NEXT: v_sub_u32_e32 v14, 64, v22 ; GFX9-NEXT: v_lshrrev_b64 v[12:13], v22, v[0:1] @@ -1659,22 +1661,23 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, -1, v24, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v25, vcc ; GFX9-NEXT: v_or_b32_e32 v11, v21, v11 -; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] ; GFX9-NEXT: v_or_b32_e32 v20, v22, v24 ; GFX9-NEXT: v_or_b32_e32 v21, v23, v25 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21] +; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_or3_b32 v8, v8, v12, v14 ; GFX9-NEXT: v_and_b32_e32 v12, 1, v30 +; GFX9-NEXT: s_andn2_b64 s[10:11], exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v21, v13 ; GFX9-NEXT: v_or3_b32 v9, v9, 0, v15 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_and_b64 s[12:13], s[10:11], -1 ; GFX9-NEXT: v_mov_b32_e32 v20, v12 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: s_cselect_b64 exec, s[10:11], s[4:5] +; GFX9-NEXT: s_cbranch_scc1 .LBB1_3 ; GFX9-NEXT: ; %bb.4: ; %Flow -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: .LBB1_5: ; %Flow2 ; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: .LBB1_5: ; %Flow2 ; GFX9-NEXT: v_lshlrev_b64 v[16:17], 1, v[10:11] ; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 31, v11 @@ -1682,8 +1685,8 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_or3_b32 v14, v8, v10, v14 ; GFX9-NEXT: v_or_b32_e32 v13, v13, v17 ; GFX9-NEXT: v_or_b32_e32 v12, v12, v16 -; GFX9-NEXT: .LBB1_6: ; %Flow3 ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-NEXT: .LBB1_6: ; %udiv-end ; GFX9-NEXT: v_mul_lo_u32 v19, v12, v7 ; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v12, 0 ; GFX9-NEXT: v_mov_b32_e32 v17, 0 @@ -1717,8 +1720,8 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane @@ -1970,32 +1973,31 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 -; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7] +; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[4:5], exec -; GFX9-O0-NEXT: v_writelane_b32 v0, s4, 2 -; GFX9-O0-NEXT: v_writelane_b32 v0, s5, 3 +; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec +; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 2 +; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 3 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O0-NEXT: s_cbranch_execz .LBB1_3 -; GFX9-O0-NEXT: s_branch .LBB1_8 +; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX9-O0-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9-O0-NEXT: s_cbranch_scc1 .LBB1_7 +; GFX9-O0-NEXT: s_branch .LBB1_2 ; GFX9-O0-NEXT: .LBB1_1: ; %Flow ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v0, 4 -; GFX9-O0-NEXT: v_readlane_b32 s5, v0, 5 -; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: ; %bb.2: ; %Flow +; GFX9-O0-NEXT: v_readlane_b32 s4, v8, 4 +; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 5 ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload @@ -2017,15 +2019,9 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_branch .LBB1_5 -; GFX9-O0-NEXT: .LBB1_3: ; %Flow2 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v4, 2 -; GFX9-O0-NEXT: v_readlane_b32 s5, v4, 3 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: s_branch .LBB1_4 +; GFX9-O0-NEXT: .LBB1_2: ; %Flow2 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -2037,8 +2033,14 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_branch .LBB1_9 -; GFX9-O0-NEXT: .LBB1_4: ; %udiv-loop-exit +; GFX9-O0-NEXT: s_branch .LBB1_8 +; GFX9-O0-NEXT: .LBB1_3: ; %udiv-loop-exit +; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 2 +; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 3 ; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload @@ -2047,13 +2049,13 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b32 s4, 1 +; GFX9-O0-NEXT: s_mov_b32 s6, 1 ; GFX9-O0-NEXT: s_waitcnt vmcnt(2) -; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[0:1] +; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s6, v[0:1] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_lshlrev_b64 v[9:10], s4, v[9:10] -; GFX9-O0-NEXT: s_mov_b32 s4, 63 -; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] +; GFX9-O0-NEXT: v_lshlrev_b64 v[9:10], s6, v[9:10] +; GFX9-O0-NEXT: s_mov_b32 s6, 63 +; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s6, v[0:1] ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 ; GFX9-O0-NEXT: v_mov_b32_e32 v12, v8 @@ -2077,15 +2079,9 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_branch .LBB1_3 -; GFX9-O0-NEXT: .LBB1_5: ; %Flow1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v8, 6 -; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 7 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-O0-NEXT: s_branch .LBB1_2 +; GFX9-O0-NEXT: .LBB1_4: ; %Flow1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload @@ -2107,15 +2103,15 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_branch .LBB1_4 -; GFX9-O0-NEXT: .LBB1_6: ; %udiv-do-while +; GFX9-O0-NEXT: s_branch .LBB1_3 +; GFX9-O0-NEXT: .LBB1_5: ; %udiv-do-while ; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s6, v16, 8 -; GFX9-O0-NEXT: v_readlane_b32 s7, v16, 9 +; GFX9-O0-NEXT: v_readlane_b32 s6, v16, 6 +; GFX9-O0-NEXT: v_readlane_b32 s7, v16, 7 ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload @@ -2276,7 +2272,7 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v18, v19 ; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[17:18], v[12:13] -; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX9-O0-NEXT: s_or_b64 s[6:7], s[4:5], s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v18, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v2 ; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill @@ -2297,12 +2293,9 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 4 -; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 5 -; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 8 -; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 9 +; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX9-O0-NEXT: v_writelane_b32 v16, s4, 6 +; GFX9-O0-NEXT: v_writelane_b32 v16, s5, 7 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] @@ -2330,10 +2323,12 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: s_cbranch_execnz .LBB1_6 +; GFX9-O0-NEXT: s_andn2_b64 s[4:5], exec, s[6:7] +; GFX9-O0-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GFX9-O0-NEXT: s_cselect_b64 exec, s[4:5], s[6:7] +; GFX9-O0-NEXT: s_cbranch_scc1 .LBB1_5 ; GFX9-O0-NEXT: s_branch .LBB1_1 -; GFX9-O0-NEXT: .LBB1_7: ; %udiv-preheader +; GFX9-O0-NEXT: .LBB1_6: ; %udiv-preheader ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload @@ -2436,8 +2431,8 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v14, s8 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, s7 ; GFX9-O0-NEXT: v_mov_b32_e32 v12, s6 -; GFX9-O0-NEXT: v_writelane_b32 v16, s4, 8 -; GFX9-O0-NEXT: v_writelane_b32 v16, s5, 9 +; GFX9-O0-NEXT: v_writelane_b32 v16, s4, 6 +; GFX9-O0-NEXT: v_writelane_b32 v16, s5, 7 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] @@ -2465,8 +2460,8 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_branch .LBB1_6 -; GFX9-O0-NEXT: .LBB1_8: ; %udiv-bb1 +; GFX9-O0-NEXT: s_branch .LBB1_5 +; GFX9-O0-NEXT: .LBB1_7: ; %udiv-bb1 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] @@ -2591,18 +2586,17 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec -; GFX9-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] -; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] -; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 6 -; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 7 +; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 4 +; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 5 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O0-NEXT: s_cbranch_execz .LBB1_5 -; GFX9-O0-NEXT: s_branch .LBB1_7 -; GFX9-O0-NEXT: .LBB1_9: ; %udiv-end +; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX9-O0-NEXT: s_cmov_b64 exec, s[4:5] +; GFX9-O0-NEXT: s_cbranch_scc1 .LBB1_6 +; GFX9-O0-NEXT: s_branch .LBB1_4 +; GFX9-O0-NEXT: .LBB1_8: ; %udiv-end ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] @@ -2859,8 +2853,10 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/remove-incompatible-extended-image-insts.ll b/llvm/test/CodeGen/AMDGPU/remove-incompatible-extended-image-insts.ll index 0c67f00d7bebf..f57e86c68ebf9 100644 --- a/llvm/test/CodeGen/AMDGPU/remove-incompatible-extended-image-insts.ll +++ b/llvm/test/CodeGen/AMDGPU/remove-incompatible-extended-image-insts.ll @@ -35,8 +35,6 @@ define <4 x float> @needs_extimg(float noundef %0, float noundef %1, <8 x i32> n ; IR: define void @caller( define void @caller(float noundef %0, float noundef %1, <8 x i32> noundef %2, <4 x i32> noundef %3) { - ; EXTIMG: call void @needs_extimg( - ; NOEXTIMG: call void null call void @needs_extimg(float %0, float %1, <8 x i32> %2, <4 x i32> %3) ; IR: ret void ret void @@ -45,3 +43,6 @@ define void @caller(float noundef %0, float noundef %1, <8 x i32> noundef %2, <4 declare <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) attributes #0 = { "target-features"="+extended-image-insts" } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; EXTIMG: {{.*}} +; NOEXTIMG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/ret_jump.ll b/llvm/test/CodeGen/AMDGPU/ret_jump.ll index ad38d78ddb2ff..4b77d6c48512a 100644 --- a/llvm/test/CodeGen/AMDGPU/ret_jump.ll +++ b/llvm/test/CodeGen/AMDGPU/ret_jump.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=GCN %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=GCN %s diff --git a/llvm/test/CodeGen/AMDGPU/scheduler-rp-calc-one-successor-two-predecessors-bug.ll b/llvm/test/CodeGen/AMDGPU/scheduler-rp-calc-one-successor-two-predecessors-bug.ll index 8cb1d250a6fa7..68eb12ee4fea2 100644 --- a/llvm/test/CodeGen/AMDGPU/scheduler-rp-calc-one-successor-two-predecessors-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/scheduler-rp-calc-one-successor-two-predecessors-bug.ll @@ -12,13 +12,14 @@ define amdgpu_ps void @_amdgpu_ps_main(float %arg) { ; GFX900-NEXT: s_mov_b64 s[4:5], exec ; GFX900-NEXT: s_wqm_b64 exec, exec ; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: s_mov_b32 s0, 0 ; GFX900-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 +; GFX900-NEXT: s_xor_b64 s[6:7], vcc, exec +; GFX900-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX900-NEXT: s_mov_b32 s0, 0 ; GFX900-NEXT: ; implicit-def: $vgpr0 ; GFX900-NEXT: ; implicit-def: $sgpr2 -; GFX900-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX900-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX900-NEXT: s_cbranch_execz .LBB0_2 +; GFX900-NEXT: s_cmov_b64 exec, vcc +; GFX900-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX900-NEXT: ; %bb.1: ; %bb1 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: s_mov_b32 s1, s0 @@ -34,13 +35,14 @@ define amdgpu_ps void @_amdgpu_ps_main(float %arg) { ; GFX900-NEXT: s_mov_b32 s15, s0 ; GFX900-NEXT: image_sample v[0:1], v[0:1], s[8:15], s[0:3] dmask:0x3 ; GFX900-NEXT: s_mov_b32 s2, 1.0 +; GFX900-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX900-NEXT: .LBB0_2: ; %Flow -; GFX900-NEXT: s_or_saveexec_b64 s[0:1], s[6:7] ; GFX900-NEXT: s_and_b64 exec, exec, s[4:5] -; GFX900-NEXT: s_and_b64 s[0:1], exec, s[0:1] +; GFX900-NEXT: s_xor_b64 s[0:1], s[6:7], exec +; GFX900-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX900-NEXT: v_mov_b32_e32 v2, s2 -; GFX900-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX900-NEXT: s_cbranch_execz .LBB0_5 +; GFX900-NEXT: s_cmov_b64 exec, s[6:7] +; GFX900-NEXT: s_cbranch_scc0 .LBB0_5 ; GFX900-NEXT: ; %bb.3: ; %bb5 ; GFX900-NEXT: s_andn2_b64 s[4:5], s[4:5], exec ; GFX900-NEXT: s_cbranch_scc0 .LBB0_6 @@ -49,8 +51,8 @@ define amdgpu_ps void @_amdgpu_ps_main(float %arg) { ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v1, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: .LBB0_5: ; %bb6 ; GFX900-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX900-NEXT: .LBB0_5: ; %bb6 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_cvt_pkrtz_f16_f32 v1, 0, v1 ; GFX900-NEXT: v_cvt_pkrtz_f16_f32 v0, v2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index b086640c72f80..13496567c1228 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -366,41 +366,45 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_xor_b32_e32 v1, v13, v3 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v13 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v13, vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[6:7] ; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 -; GCN-IR-NEXT: v_add_i32_e64 v2, s[6:7], 32, v2 +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[4:5] +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 ; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3 ; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v6 -; GCN-IR-NEXT: v_add_i32_e64 v2, s[6:7], 32, v2 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v7 ; GCN-IR-NEXT: v_min_u32_e32 v11, v2, v3 -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[6:7], v10, v11 -; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[6:7] -; GCN-IR-NEXT: v_subb_u32_e64 v3, s[6:7], 0, 0, s[6:7] -; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[6:7], 63, v[2:3] -; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[2:3] -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, v10, v11 +; GCN-IR-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[4:5], 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[8:9], s[8:9], vcc +; GCN-IR-NEXT: s_xor_b64 s[10:11], s[8:9], -1 +; GCN-IR-NEXT: s_and_b64 s[4:5], s[10:11], s[4:5] +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-IR-NEXT: s_mov_b64 s[6:7], exec ; GCN-IR-NEXT: v_mov_b32_e32 v14, v12 ; GCN-IR-NEXT: v_mov_b32_e32 v15, v13 -; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v7, 0, s[4:5] -; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v6, 0, s[4:5] -; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB1_6 +; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v7, 0, s[8:9] +; GCN-IR-NEXT: s_and_b64 s[10:11], s[4:5], -1 +; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v6, 0, s[8:9] +; GCN-IR-NEXT: s_cmov_b64 exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_scc0 .LBB1_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v2 ; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[6:7], v2 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 +; GCN-IR-NEXT: s_xor_b64 s[8:9], vcc, exec ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB1_5 +; GCN-IR-NEXT: s_and_b64 s[4:5], vcc, -1 +; GCN-IR-NEXT: s_cmov_b64 exec, vcc +; GCN-IR-NEXT: s_cbranch_scc0 .LBB1_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, -1, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, -1, v1, vcc @@ -418,34 +422,35 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 ; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4 -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v16, v8 +; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v17, v9, vcc +; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6 ; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6 +; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 ; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10 ; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1 ; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0 -; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] ; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10 ; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: s_andn2_b64 s[4:5], exec, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 +; GCN-IR-NEXT: s_and_b64 s[12:13], s[4:5], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3 +; GCN-IR-NEXT: s_cselect_b64 exec, s[4:5], s[10:11] +; GCN-IR-NEXT: s_cbranch_scc1 .LBB1_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB1_5: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: .LBB1_5: ; %Flow4 ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v1 ; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v0 -; GCN-IR-NEXT: .LBB1_6: ; %Flow5 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: .LBB1_6: ; %udiv-end ; GCN-IR-NEXT: v_xor_b32_e32 v0, v13, v12 ; GCN-IR-NEXT: v_xor_b32_e32 v1, v15, v14 ; GCN-IR-NEXT: v_xor_b32_e32 v3, v4, v0 @@ -1510,22 +1515,26 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) { ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: v_cndmask_b32_e64 v4, 24, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-IR-NEXT: s_mov_b64 s[8:9], exec ; GCN-IR-NEXT: v_mov_b32_e32 v13, v12 +; GCN-IR-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB11_6 +; GCN-IR-NEXT: s_cmov_b64 exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_scc0 .LBB11_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v2 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], 24, v2 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 +; GCN-IR-NEXT: s_xor_b64 s[6:7], vcc, exec ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB11_5 +; GCN-IR-NEXT: s_and_b64 s[4:5], vcc, -1 +; GCN-IR-NEXT: s_cmov_b64 exec, vcc +; GCN-IR-NEXT: s_cbranch_scc0 .LBB11_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v1, vcc @@ -1541,34 +1550,35 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 ; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4 -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v14, v8 +; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v15, v9, vcc +; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6 ; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6 +; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 ; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10 ; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1 ; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0 -; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] ; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10 ; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: s_andn2_b64 s[4:5], exec, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 +; GCN-IR-NEXT: s_and_b64 s[12:13], s[4:5], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: s_cbranch_execnz .LBB11_3 +; GCN-IR-NEXT: s_cselect_b64 exec, s[4:5], s[10:11] +; GCN-IR-NEXT: s_cbranch_scc1 .LBB11_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: .LBB11_5: ; %Flow4 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v1 ; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v0 -; GCN-IR-NEXT: .LBB11_6: ; %Flow5 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: .LBB11_6: ; %udiv-end ; GCN-IR-NEXT: v_xor_b32_e32 v0, v4, v12 ; GCN-IR-NEXT: v_xor_b32_e32 v1, v5, v13 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v12 @@ -1704,23 +1714,27 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-IR-NEXT: s_mov_b64 s[8:9], exec ; GCN-IR-NEXT: v_mov_b32_e32 v13, v12 +; GCN-IR-NEXT: s_and_b64 s[6:7], s[4:5], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB12_6 +; GCN-IR-NEXT: s_cmov_b64 exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_scc0 .LBB12_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v2 -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc -; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 +; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[4:5], v2 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 +; GCN-IR-NEXT: s_xor_b64 s[6:7], vcc, exec ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GCN-IR-NEXT: s_cbranch_execz .LBB12_5 +; GCN-IR-NEXT: s_and_b64 s[10:11], vcc, -1 +; GCN-IR-NEXT: s_cmov_b64 exec, vcc +; GCN-IR-NEXT: s_cbranch_scc0 .LBB12_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v1, vcc @@ -1736,34 +1750,35 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 ; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4 -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v14, v8 +; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v15, v9, vcc +; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6 ; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6 +; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 ; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10 ; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1 ; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0 -; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] ; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10 ; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: s_andn2_b64 s[4:5], exec, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 +; GCN-IR-NEXT: s_and_b64 s[12:13], s[4:5], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: s_cbranch_execnz .LBB12_3 +; GCN-IR-NEXT: s_cselect_b64 exec, s[4:5], s[10:11] +; GCN-IR-NEXT: s_cbranch_scc1 .LBB12_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: .LBB12_5: ; %Flow4 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v1 ; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v0 -; GCN-IR-NEXT: .LBB12_6: ; %Flow5 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: .LBB12_6: ; %udiv-end ; GCN-IR-NEXT: v_xor_b32_e32 v0, v4, v12 ; GCN-IR-NEXT: v_xor_b32_e32 v1, v5, v13 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v12 @@ -1800,26 +1815,30 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5] ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] ; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[0:1] -; GCN-IR-NEXT: v_mov_b32_e32 v11, v10 +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[0:1] ; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[0:1] -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GCN-IR-NEXT: s_xor_b64 s[10:11], s[4:5], -1 +; GCN-IR-NEXT: s_and_b64 s[6:7], s[10:11], s[6:7] +; GCN-IR-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN-IR-NEXT: s_mov_b64 s[8:9], exec +; GCN-IR-NEXT: v_mov_b32_e32 v11, v10 ; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v5, 0, s[4:5] +; GCN-IR-NEXT: s_and_b64 s[10:11], s[6:7], -1 ; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[4:5] -; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB13_6 +; GCN-IR-NEXT: s_cmov_b64 exec, s[6:7] +; GCN-IR-NEXT: s_cbranch_scc0 .LBB13_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v0, s[4:5], 63, v0 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GCN-IR-NEXT: v_sub_i32_e64 v0, s[4:5], 63, v0 ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], v0 ; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 +; GCN-IR-NEXT: s_xor_b64 s[6:7], vcc, exec ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB13_5 +; GCN-IR-NEXT: s_and_b64 s[4:5], vcc, -1 +; GCN-IR-NEXT: s_cmov_b64 exec, vcc +; GCN-IR-NEXT: s_cbranch_scc0 .LBB13_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_lshr_b64 v[6:7], v[4:5], v6 ; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 0xffffffcf, v8 @@ -1844,23 +1863,24 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v8 ; GCN-IR-NEXT: v_and_b32_e32 v8, 0x8000, v8 ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] -; GCN-IR-NEXT: v_or_b32_e32 v1, v9, v1 ; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], v6, v8 -; GCN-IR-NEXT: v_mov_b32_e32 v9, v3 ; GCN-IR-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v7, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: v_or_b32_e32 v1, v9, v1 +; GCN-IR-NEXT: s_andn2_b64 s[4:5], exec, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v9, v3 +; GCN-IR-NEXT: s_and_b64 s[14:15], s[4:5], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v8, v2 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: s_cbranch_execnz .LBB13_3 +; GCN-IR-NEXT: s_cselect_b64 exec, s[4:5], s[10:11] +; GCN-IR-NEXT: s_cbranch_scc1 .LBB13_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: .LBB13_5: ; %Flow4 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 ; GCN-IR-NEXT: v_or_b32_e32 v3, v3, v1 ; GCN-IR-NEXT: v_or_b32_e32 v2, v2, v0 -; GCN-IR-NEXT: .LBB13_6: ; %Flow5 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: .LBB13_6: ; %udiv-end ; GCN-IR-NEXT: v_xor_b32_e32 v0, v2, v10 ; GCN-IR-NEXT: v_xor_b32_e32 v1, v3, v11 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 diff --git a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll index 81858bd3d29ee..528ae819579de 100644 --- a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll +++ b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll @@ -6,14 +6,19 @@ define amdgpu_cs void @if_then(ptr addrspace(8) inreg %input, ptr addrspace(8) i ; GCN: ; %bb.0: ; %.entry ; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GCN-NEXT: s_mov_b32 s0, exec_lo +; GCN-NEXT: s_and_b32 s1, vcc_lo, -1 +; GCN-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GCN-NEXT: s_cbranch_scc0 .LBB0_2 ; GCN-NEXT: ; %bb.1: ; %.bb0 ; GCN-NEXT: v_mov_b32_e32 v3, 1 -; GCN-NEXT: ; %bb.2: ; %.merge ; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GCN-NEXT: .LBB0_2: ; %.merge ; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, 3, v0 -; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GCN-NEXT: s_cbranch_execz .LBB0_4 +; GCN-NEXT: s_mov_b32 s0, exec_lo +; GCN-NEXT: s_and_b32 s1, vcc_lo, -1 +; GCN-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GCN-NEXT: s_cbranch_scc0 .LBB0_4 ; GCN-NEXT: ; %bb.3: ; %.then ; GCN-NEXT: v_mov_b32_e32 v1, v3 ; GCN-NEXT: s_not_b32 exec_lo, exec_lo @@ -27,9 +32,9 @@ define amdgpu_cs void @if_then(ptr addrspace(8) inreg %input, ptr addrspace(8) i ; GCN-NEXT: v_mov_b32_e32 v4, -1 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: buffer_store_dword v4, v0, s[4:7], 0 offen -; GCN-NEXT: .LBB0_4: ; %.end ; GCN-NEXT: s_waitcnt_depctr 0xffe3 ; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GCN-NEXT: .LBB0_4: ; %.end ; GCN-NEXT: v_mov_b32_e32 v0, -1 ; GCN-NEXT: buffer_store_dword v0, v3, s[4:7], 0 offen ; GCN-NEXT: s_endpgm @@ -65,21 +70,20 @@ define amdgpu_cs void @if_else_vgpr_opt(ptr addrspace(8) inreg %input, ptr addrs ; GCN: ; %bb.0: ; %.entry ; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GCN-NEXT: s_mov_b32 s0, exec_lo +; GCN-NEXT: s_and_b32 s1, vcc_lo, -1 +; GCN-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GCN-NEXT: s_cbranch_scc0 .LBB1_2 ; GCN-NEXT: ; %bb.1: ; %.bb0 ; GCN-NEXT: v_mov_b32_e32 v3, 1 -; GCN-NEXT: ; %bb.2: ; %.merge ; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GCN-NEXT: .LBB1_2: ; %.merge ; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, 3, v0 -; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GCN-NEXT: s_xor_b32 s0, exec_lo, s0 -; GCN-NEXT: s_cbranch_execnz .LBB1_5 -; GCN-NEXT: ; %bb.3: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b32 s0, s0 -; GCN-NEXT: s_cbranch_execnz .LBB1_6 -; GCN-NEXT: .LBB1_4: ; %.end -; GCN-NEXT: s_endpgm -; GCN-NEXT: .LBB1_5: ; %.else +; GCN-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GCN-NEXT: s_and_b32 s1, vcc_lo, -1 +; GCN-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GCN-NEXT: s_cbranch_scc0 .LBB1_4 +; GCN-NEXT: ; %bb.3: ; %.else ; GCN-NEXT: s_or_saveexec_b32 s1, -1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_mov_b32 exec_lo, s1 @@ -94,11 +98,17 @@ define amdgpu_cs void @if_else_vgpr_opt(ptr addrspace(8) inreg %input, ptr addrs ; GCN-NEXT: v_mov_b32_e32 v3, -1 ; GCN-NEXT: buffer_store_dword v3, v0, s[4:7], 0 offen ; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_andn2_saveexec_b32 s0, s0 -; GCN-NEXT: s_cbranch_execz .LBB1_4 -; GCN-NEXT: .LBB1_6: ; %.then +; GCN-NEXT: s_waitcnt_depctr 0xffe3 +; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GCN-NEXT: .LBB1_4: ; %Flow +; GCN-NEXT: s_xor_b32 s1, s0, exec_lo +; GCN-NEXT: s_and_b32 s1, s0, -1 +; GCN-NEXT: s_cmov_b32 exec_lo, s0 +; GCN-NEXT: s_cbranch_scc0 .LBB1_6 +; GCN-NEXT: ; %bb.5: ; %.then ; GCN-NEXT: v_mov_b32_e32 v0, -1 ; GCN-NEXT: buffer_store_dword v0, v3, s[4:7], 0 offen +; GCN-NEXT: .LBB1_6: ; %.end ; GCN-NEXT: s_endpgm .entry: %LocalInvocationId.i0 = extractelement <3 x i32> %LocalInvocationId, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/setcc-sext.ll b/llvm/test/CodeGen/AMDGPU/setcc-sext.ll index 4432ac4a9e8ff..dbd8524cb7819 100644 --- a/llvm/test/CodeGen/AMDGPU/setcc-sext.ll +++ b/llvm/test/CodeGen/AMDGPU/setcc-sext.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}setcc_sgt_true_sext: diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll index b67ecc2f9d13c..b7495b361c712 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll @@ -108,27 +108,30 @@ endif: define amdgpu_kernel void @sgpr_if_else_valu_br(ptr addrspace(1) %out, float %a, i32 %b, i32 %c, i32 %d, i32 %e) { ; SI-LABEL: sgpr_if_else_valu_br: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xc ; SI-NEXT: v_cvt_f32_u32_e32 v0, v0 -; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xc ; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v0 -; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc -; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; SI-NEXT: s_cbranch_execz .LBB2_2 +; SI-NEXT: s_xor_b64 s[2:3], vcc, exec +; SI-NEXT: s_and_b64 s[8:9], vcc, -1 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: s_cmov_b64 exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB2_2 ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_add_i32 s8, s6, s7 +; SI-NEXT: s_or_b64 exec, exec, s[2:3] ; SI-NEXT: .LBB2_2: ; %Flow -; SI-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_xor_b64 s[6:7], s[2:3], exec +; SI-NEXT: s_and_b64 s[10:11], s[2:3], -1 ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: s_xor_b64 exec, exec, s[2:3] -; SI-NEXT: s_cbranch_execz .LBB2_4 +; SI-NEXT: s_cmov_b64 exec, s[2:3] +; SI-NEXT: s_cbranch_scc0 .LBB2_4 ; SI-NEXT: ; %bb.3: ; %if -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_add_i32 s4, s4, s5 -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_add_i32 s2, s4, s5 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: s_or_b64 exec, exec, s[6:7] ; SI-NEXT: .LBB2_4: ; %endif -; SI-NEXT: s_or_b64 exec, exec, s[2:3] ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -160,13 +163,14 @@ define amdgpu_kernel void @sgpr_if_else_valu_cmp_phi_br(ptr addrspace(1) %out, p ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_xor_b64 s[10:11], vcc, exec +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_and_b64 s[8:9], vcc, -1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: ; implicit-def: $sgpr8_sgpr9 -; SI-NEXT: s_and_saveexec_b64 s[10:11], vcc -; SI-NEXT: s_xor_b64 s[10:11], exec, s[10:11] -; SI-NEXT: s_cbranch_execz .LBB3_2 +; SI-NEXT: s_cmov_b64 exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB3_2 ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 @@ -176,10 +180,13 @@ define amdgpu_kernel void @sgpr_if_else_valu_cmp_phi_br(ptr addrspace(1) %out, p ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v0 ; SI-NEXT: s_and_b64 s[8:9], vcc, exec ; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_or_b64 exec, exec, s[10:11] ; SI-NEXT: .LBB3_2: ; %Flow ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_andn2_saveexec_b64 s[0:1], s[10:11] -; SI-NEXT: s_cbranch_execz .LBB3_4 +; SI-NEXT: s_xor_b64 s[0:1], s[10:11], exec +; SI-NEXT: s_and_b64 s[2:3], s[10:11], -1 +; SI-NEXT: s_cmov_b64 exec, s[10:11] +; SI-NEXT: s_cbranch_scc0 .LBB3_4 ; SI-NEXT: ; %bb.3: ; %if ; SI-NEXT: s_mov_b32 s15, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 @@ -191,8 +198,8 @@ define amdgpu_kernel void @sgpr_if_else_valu_cmp_phi_br(ptr addrspace(1) %out, p ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; SI-NEXT: s_and_b64 s[6:7], vcc, exec ; SI-NEXT: s_or_b64 s[8:9], s[2:3], s[6:7] -; SI-NEXT: .LBB3_4: ; %endif ; SI-NEXT: s_or_b64 exec, exec, s[0:1] +; SI-NEXT: .LBB3_4: ; %endif ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[8:9] diff --git a/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll b/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll index 09e342fe19066..789b520bd34ea 100644 --- a/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll @@ -8,37 +8,49 @@ define amdgpu_cs void @should_not_hoist_set_inactive(<4 x i32> inreg %i14, i32 i ; GCN-NEXT: v_cmp_eq_u32_e64 s5, 0, v0 ; GCN-NEXT: v_cmp_ne_u32_e64 s6, 0, v2 ; GCN-NEXT: s_mov_b32 s7, 0 -; GCN-NEXT: s_branch .LBB0_2 -; GCN-NEXT: .LBB0_1: ; %bb4 -; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 +; GCN-NEXT: s_branch .LBB0_3 +; GCN-NEXT: .LBB0_1: ; %Flow +; GCN-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; GCN-NEXT: s_waitcnt_depctr 0xffe3 ; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GCN-NEXT: .LBB0_2: ; %bb4 +; GCN-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; GCN-NEXT: s_and_b32 s8, exec_lo, s6 ; GCN-NEXT: s_or_b32 s7, s8, s7 -; GCN-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 -; GCN-NEXT: s_cbranch_execz .LBB0_5 -; GCN-NEXT: .LBB0_2: ; %bb +; GCN-NEXT: s_andn2_b32 s8, exec_lo, s7 +; GCN-NEXT: s_and_b32 s9, s8, -1 +; GCN-NEXT: s_cselect_b32 exec_lo, s8, s7 +; GCN-NEXT: s_cbranch_scc0 .LBB0_6 +; GCN-NEXT: .LBB0_3: ; %bb ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: s_and_saveexec_b32 s8, vcc_lo -; GCN-NEXT: s_cbranch_execz .LBB0_1 -; GCN-NEXT: ; %bb.3: ; %bb1 -; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 +; GCN-NEXT: s_and_b32 s9, vcc_lo, exec_lo +; GCN-NEXT: s_mov_b32 s8, exec_lo +; GCN-NEXT: s_and_b32 s10, s9, -1 +; GCN-NEXT: s_cmov_b32 exec_lo, s9 +; GCN-NEXT: s_cbranch_scc0 .LBB0_2 +; GCN-NEXT: ; %bb.4: ; %bb1 +; GCN-NEXT: ; in Loop: Header=BB0_3 Depth=1 +; GCN-NEXT: s_mov_b32 s9, exec_lo ; GCN-NEXT: v_mov_b32_e32 v3, s4 ; GCN-NEXT: s_not_b32 exec_lo, exec_lo ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_not_b32 exec_lo, exec_lo -; GCN-NEXT: s_or_saveexec_b32 s9, -1 +; GCN-NEXT: s_or_saveexec_b32 s10, -1 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_dpp v4, v3 row_xmask:1 row_mask:0xf bank_mask:0xf -; GCN-NEXT: s_mov_b32 exec_lo, s9 +; GCN-NEXT: s_mov_b32 exec_lo, s10 ; GCN-NEXT: v_mov_b32_e32 v0, v4 -; GCN-NEXT: s_and_b32 exec_lo, exec_lo, s5 -; GCN-NEXT: s_cbranch_execz .LBB0_1 -; GCN-NEXT: ; %bb.4: ; %bb2 -; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 +; GCN-NEXT: s_and_b32 s10, s5, exec_lo +; GCN-NEXT: s_and_b32 s11, s10, -1 +; GCN-NEXT: s_cmov_b32 exec_lo, s10 +; GCN-NEXT: s_cbranch_scc0 .LBB0_1 +; GCN-NEXT: ; %bb.5: ; %bb2 +; GCN-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; GCN-NEXT: buffer_atomic_add v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt_depctr 0xffe3 +; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GCN-NEXT: s_branch .LBB0_1 -; GCN-NEXT: .LBB0_5: ; %bb5 +; GCN-NEXT: .LBB0_6: ; %bb5 ; GCN-NEXT: s_endpgm .entry: br label %bb diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-kill.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-kill.ll index cef959f45437d..ae8745b5c48ed 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-kill.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-kill.ll @@ -7,24 +7,30 @@ define amdgpu_ps float @uniform_kill(float %a, i32 %b, float %c) { ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 ; SI-NEXT: s_mov_b64 s[0:1], exec -; SI-NEXT: s_mov_b64 s[2:3], -1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_and_b32_e32 v0, 1, v0 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], vcc, exec +; SI-NEXT: s_and_b64 s[2:3], vcc, -1 +; SI-NEXT: s_mov_b64 s[2:3], -1 +; SI-NEXT: s_cmov_b64 exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB0_2 ; SI-NEXT: ; %bb.1: ; %if1 ; SI-NEXT: s_xor_b64 s[2:3], exec, -1 -; SI-NEXT: ; %bb.2: ; %endif1 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: .LBB0_2: ; %endif1 ; SI-NEXT: s_wqm_b64 s[4:5], s[2:3] ; SI-NEXT: s_xor_b64 s[4:5], s[4:5], exec ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; SI-NEXT: s_cbranch_scc0 .LBB0_6 ; SI-NEXT: ; %bb.3: ; %endif1 ; SI-NEXT: s_and_b64 exec, exec, s[0:1] +; SI-NEXT: s_and_b64 s[2:3], s[2:3], exec +; SI-NEXT: s_mov_b64 s[0:1], exec +; SI-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 -; SI-NEXT: s_and_saveexec_b64 s[0:1], s[2:3] -; SI-NEXT: s_cbranch_execz .LBB0_5 +; SI-NEXT: s_cmov_b64 exec, s[2:3] +; SI-NEXT: s_cbranch_scc0 .LBB0_5 ; SI-NEXT: ; %bb.4: ; %if2 ; SI-NEXT: s_mov_b32 s3, 0 ; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 @@ -36,8 +42,8 @@ define amdgpu_ps float @uniform_kill(float %a, i32 %b, float %c) { ; SI-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: v_cvt_f32_i32_e32 v0, v0 -; SI-NEXT: .LBB0_5: ; %endif2 ; SI-NEXT: s_or_b64 exec, exec, s[0:1] +; SI-NEXT: .LBB0_5: ; %endif2 ; SI-NEXT: s_branch .LBB0_7 ; SI-NEXT: .LBB0_6: ; SI-NEXT: s_mov_b64 exec, 0 @@ -49,24 +55,30 @@ define amdgpu_ps float @uniform_kill(float %a, i32 %b, float %c) { ; FLAT: ; %bb.0: ; %entry ; FLAT-NEXT: v_cvt_i32_f32_e32 v0, v0 ; FLAT-NEXT: s_mov_b64 s[0:1], exec -; FLAT-NEXT: s_mov_b64 s[2:3], -1 ; FLAT-NEXT: v_or_b32_e32 v0, v1, v0 ; FLAT-NEXT: v_and_b32_e32 v0, 1, v0 ; FLAT-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; FLAT-NEXT: s_and_saveexec_b64 s[4:5], vcc +; FLAT-NEXT: s_xor_b64 s[4:5], vcc, exec +; FLAT-NEXT: s_and_b64 s[2:3], vcc, -1 +; FLAT-NEXT: s_mov_b64 s[2:3], -1 +; FLAT-NEXT: s_cmov_b64 exec, vcc +; FLAT-NEXT: s_cbranch_scc0 .LBB0_2 ; FLAT-NEXT: ; %bb.1: ; %if1 ; FLAT-NEXT: s_xor_b64 s[2:3], exec, -1 -; FLAT-NEXT: ; %bb.2: ; %endif1 ; FLAT-NEXT: s_or_b64 exec, exec, s[4:5] +; FLAT-NEXT: .LBB0_2: ; %endif1 ; FLAT-NEXT: s_wqm_b64 s[4:5], s[2:3] ; FLAT-NEXT: s_xor_b64 s[4:5], s[4:5], exec ; FLAT-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; FLAT-NEXT: s_cbranch_scc0 .LBB0_6 ; FLAT-NEXT: ; %bb.3: ; %endif1 ; FLAT-NEXT: s_and_b64 exec, exec, s[0:1] +; FLAT-NEXT: s_and_b64 s[2:3], s[2:3], exec +; FLAT-NEXT: s_mov_b64 s[0:1], exec +; FLAT-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; FLAT-NEXT: v_mov_b32_e32 v0, 0 -; FLAT-NEXT: s_and_saveexec_b64 s[0:1], s[2:3] -; FLAT-NEXT: s_cbranch_execz .LBB0_5 +; FLAT-NEXT: s_cmov_b64 exec, s[2:3] +; FLAT-NEXT: s_cbranch_scc0 .LBB0_5 ; FLAT-NEXT: ; %bb.4: ; %if2 ; FLAT-NEXT: s_mov_b32 s3, 0 ; FLAT-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 @@ -78,8 +90,8 @@ define amdgpu_ps float @uniform_kill(float %a, i32 %b, float %c) { ; FLAT-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 offset:4 glc ; FLAT-NEXT: s_waitcnt vmcnt(0) ; FLAT-NEXT: v_cvt_f32_i32_e32 v0, v0 -; FLAT-NEXT: .LBB0_5: ; %endif2 ; FLAT-NEXT: s_or_b64 exec, exec, s[0:1] +; FLAT-NEXT: .LBB0_5: ; %endif2 ; FLAT-NEXT: s_branch .LBB0_7 ; FLAT-NEXT: .LBB0_6: ; FLAT-NEXT: s_mov_b64 exec, 0 diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll index 2495c0dff8929..7d06c2b9e3dbc 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; RUN: opt -mtriple=amdgcn-- -S -structurizecfg -si-annotate-control-flow -simplifycfg-require-and-preserve-domtree=1 %s | FileCheck -check-prefix=OPT %s ; RUN: llc -mtriple=amdgcn -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=GCN %s diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-unreachable.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-unreachable.ll index 165b996981e34..f13f60a00fe96 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-unreachable.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-unreachable.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; RUN: opt -mtriple=amdgcn-- -S -structurizecfg -si-annotate-control-flow %s | FileCheck -check-prefix=OPT %s ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll index 1ab63762ecbd7..1198a6e217fd9 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll @@ -16,10 +16,11 @@ define amdgpu_kernel void @break_inserted_outside_of_loop(ptr addrspace(1) %out, ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_and_b64 s[4:5], exec, vcc ; SI-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3] -; SI-NEXT: s_andn2_b64 exec, exec, s[2:3] -; SI-NEXT: s_cbranch_execnz .LBB0_1 +; SI-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; SI-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; SI-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; SI-NEXT: s_cbranch_scc1 .LBB0_1 ; SI-NEXT: ; %bb.2: ; %ENDLOOP -; SI-NEXT: s_or_b64 exec, exec, s[2:3] ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -41,10 +42,11 @@ define amdgpu_kernel void @break_inserted_outside_of_loop(ptr addrspace(1) %out, ; FLAT-NEXT: ; =>This Inner Loop Header: Depth=1 ; FLAT-NEXT: s_and_b64 s[4:5], exec, vcc ; FLAT-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3] -; FLAT-NEXT: s_andn2_b64 exec, exec, s[2:3] -; FLAT-NEXT: s_cbranch_execnz .LBB0_1 +; FLAT-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; FLAT-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; FLAT-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; FLAT-NEXT: s_cbranch_scc1 .LBB0_1 ; FLAT-NEXT: ; %bb.2: ; %ENDLOOP -; FLAT-NEXT: s_or_b64 exec, exec, s[2:3] ; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; FLAT-NEXT: s_mov_b32 s3, 0xf000 ; FLAT-NEXT: s_mov_b32 s2, -1 @@ -71,50 +73,56 @@ define amdgpu_kernel void @phi_cond_outside_loop(i32 %b) { ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, -1 +; SI-NEXT: s_mov_b64 s[6:7], exec ; SI-NEXT: s_mov_b64 s[2:3], 0 ; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: s_and_saveexec_b64 s[6:7], vcc -; SI-NEXT: s_cbranch_execz .LBB1_2 +; SI-NEXT: s_cmov_b64 exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB1_2 ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: s_load_dword s0, s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s0, 0 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: s_and_b64 s[4:5], s[0:1], exec -; SI-NEXT: .LBB1_2: ; %endif ; SI-NEXT: s_or_b64 exec, exec, s[6:7] -; SI-NEXT: .LBB1_3: ; %loop +; SI-NEXT: .LBB1_2: ; %loop ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_and_b64 s[0:1], exec, s[4:5] ; SI-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3] -; SI-NEXT: s_andn2_b64 exec, exec, s[2:3] -; SI-NEXT: s_cbranch_execnz .LBB1_3 -; SI-NEXT: ; %bb.4: ; %exit +; SI-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] +; SI-NEXT: s_and_b64 s[6:7], s[0:1], -1 +; SI-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; SI-NEXT: s_cbranch_scc1 .LBB1_2 +; SI-NEXT: ; %bb.3: ; %exit ; SI-NEXT: s_endpgm ; ; FLAT-LABEL: phi_cond_outside_loop: ; FLAT: ; %bb.0: ; %entry ; FLAT-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; FLAT-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; FLAT-NEXT: s_and_b64 s[4:5], vcc, -1 +; FLAT-NEXT: s_mov_b64 s[6:7], exec ; FLAT-NEXT: s_mov_b64 s[2:3], 0 ; FLAT-NEXT: s_mov_b64 s[4:5], 0 -; FLAT-NEXT: s_and_saveexec_b64 s[6:7], vcc -; FLAT-NEXT: s_cbranch_execz .LBB1_2 +; FLAT-NEXT: s_cmov_b64 exec, vcc +; FLAT-NEXT: s_cbranch_scc0 .LBB1_2 ; FLAT-NEXT: ; %bb.1: ; %else ; FLAT-NEXT: s_load_dword s0, s[0:1], 0x24 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: s_cmp_eq_u32 s0, 0 ; FLAT-NEXT: s_cselect_b64 s[0:1], -1, 0 ; FLAT-NEXT: s_and_b64 s[4:5], s[0:1], exec -; FLAT-NEXT: .LBB1_2: ; %endif ; FLAT-NEXT: s_or_b64 exec, exec, s[6:7] -; FLAT-NEXT: .LBB1_3: ; %loop +; FLAT-NEXT: .LBB1_2: ; %loop ; FLAT-NEXT: ; =>This Inner Loop Header: Depth=1 ; FLAT-NEXT: s_and_b64 s[0:1], exec, s[4:5] ; FLAT-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3] -; FLAT-NEXT: s_andn2_b64 exec, exec, s[2:3] -; FLAT-NEXT: s_cbranch_execnz .LBB1_3 -; FLAT-NEXT: ; %bb.4: ; %exit +; FLAT-NEXT: s_andn2_b64 s[0:1], exec, s[2:3] +; FLAT-NEXT: s_and_b64 s[6:7], s[0:1], -1 +; FLAT-NEXT: s_cselect_b64 exec, s[0:1], s[2:3] +; FLAT-NEXT: s_cbranch_scc1 .LBB1_2 +; FLAT-NEXT: ; %bb.3: ; %exit ; FLAT-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-dbg-info.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-dbg-info.ll index a7b4eee84cb9e..dcee38b4f0f96 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-dbg-info.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-dbg-info.ll @@ -18,12 +18,13 @@ define amdgpu_ps i32 @if_else(i32 %0) !dbg !5 { ; OPT-NEXT: [[TMP8:%.*]] = extractvalue { i1, i64 } [[TMP6]], 1, !dbg [[DBG14]] ; OPT-NEXT: br i1 [[TMP7]], label [[TRUE:%.*]], label [[EXIT:%.*]], !dbg [[DBG14]] ; OPT: true: -; OPT-NEXT: br label [[EXIT]], !dbg [[DBG15:![0-9]+]] +; OPT-NEXT: call void @llvm.amdgcn.wave.reconverge.i64(i64 [[TMP8]]), !dbg [[DBG15:![0-9]+]] +; OPT-NEXT: br label [[EXIT]], !dbg [[DBG15]] ; OPT: false: -; OPT-NEXT: br label [[FLOW]], !dbg [[DBG16:![0-9]+]] +; OPT-NEXT: call void @llvm.amdgcn.wave.reconverge.i64(i64 [[TMP4]]), !dbg [[DBG16:![0-9]+]] +; OPT-NEXT: br label [[FLOW]], !dbg [[DBG16]] ; OPT: exit: ; OPT-NEXT: [[RET:%.*]] = phi i32 [ [[TMP5]], [[FLOW]] ], [ 42, [[TRUE]] ], !dbg [[DBG17:![0-9]+]] -; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP8]]) ; OPT-NEXT: tail call void @llvm.dbg.value(metadata i32 [[RET]], metadata [[META11:![0-9]+]], metadata !DIExpression()), !dbg [[DBG17]] ; OPT-NEXT: ret i32 [[RET]], !dbg [[DBG18:![0-9]+]] ; @@ -61,16 +62,15 @@ define amdgpu_ps void @loop_if_break(i32 %n) !dbg !19 { ; OPT: loop_body: ; OPT-NEXT: [[I_NEXT:%.*]] = sub i32 [[I]], 1, !dbg [[DBG28:![0-9]+]] ; OPT-NEXT: tail call void @llvm.dbg.value(metadata i32 [[I_NEXT]], metadata [[META23:![0-9]+]], metadata !DIExpression()), !dbg [[DBG28]] -; OPT-NEXT: br label [[FLOW]], !dbg [[DBG29:![0-9]+]] +; OPT-NEXT: call void @llvm.amdgcn.wave.reconverge.i64(i64 [[TMP2]]), !dbg [[DBG29:![0-9]+]] +; OPT-NEXT: br label [[FLOW]], !dbg [[DBG29]] ; OPT: Flow: ; OPT-NEXT: [[TMP3]] = phi i32 [ [[I_NEXT]], [[LOOP_BODY]] ], [ undef, [[LOOP]] ] ; OPT-NEXT: [[TMP4:%.*]] = phi i1 [ false, [[LOOP_BODY]] ], [ true, [[LOOP]] ] -; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP2]]) ; OPT-NEXT: [[TMP5]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP4]], i64 [[PHI_BROKEN]]), !dbg [[DBG27]] ; OPT-NEXT: [[TMP6:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP5]]), !dbg [[DBG27]] ; OPT-NEXT: br i1 [[TMP6]], label [[EXIT:%.*]], label [[LOOP]], !dbg [[DBG27]] ; OPT: exit: -; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP5]]) ; OPT-NEXT: ret void, !dbg [[DBG30:![0-9]+]] ; entry: diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll index 13f8eff94f86b..00779c1bb109c 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -mtriple=amdgcn-amd-amdhsa -p simplifycfg,amdgpu-unify-divergent-exit-nodes %s -S -o - | FileCheck %s --check-prefix=OPT ; RUN: llc -mtriple=amdgcn-amd-amdhsa %s -o - | FileCheck %s --check-prefix=ISA diff --git a/llvm/test/CodeGen/AMDGPU/si-annotatecfg-multiple-backedges.ll b/llvm/test/CodeGen/AMDGPU/si-annotatecfg-multiple-backedges.ll index 0edd9f4cd6b4f..b090fddfafe26 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotatecfg-multiple-backedges.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotatecfg-multiple-backedges.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -mtriple=amdgcn-- -S -structurizecfg -si-annotate-control-flow %s | FileCheck -check-prefix=OPT %s @@ -12,7 +13,7 @@ define amdgpu_kernel void @multiple_backedges(i32 %arg, ptr %arg1) { ; OPT-NEXT: [[TMP2:%.*]] = shl nsw i32 [[ARG:%.*]], 1 ; OPT-NEXT: br label [[LOOP:%.*]] ; OPT: loop: -; OPT-NEXT: [[PHI_BROKEN1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOOP_END:%.*]] ], [ [[PHI_BROKEN1]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] +; OPT-NEXT: [[PHI_BROKEN1:%.*]] = phi i64 [ [[TMP2]], [[LOOP_END:%.*]] ], [ [[PHI_BROKEN1]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] ; OPT-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ 0, [[LOOP_END]] ], [ [[TMP0:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ] ; OPT-NEXT: [[TMP4:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP5:%.*]], [[LOOP]] ], [ 0, [[LOOP_END]] ] ; OPT-NEXT: [[TMP5]] = add nsw i32 [[TMP4]], [[TMP]] @@ -21,13 +22,11 @@ define amdgpu_kernel void @multiple_backedges(i32 %arg, ptr %arg1) { ; OPT-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP0]]) ; OPT-NEXT: br i1 [[TMP1]], label [[LOOP_END]], label [[LOOP]] ; OPT: loop_end: -; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP0]]) ; OPT-NEXT: [[EXIT:%.*]] = icmp sgt i32 [[TMP5]], [[TMP2]] -; OPT-NEXT: [[TMP7]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[EXIT]], i64 [[PHI_BROKEN1]]) -; OPT-NEXT: [[TMP3:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP7]]) +; OPT-NEXT: [[TMP2]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[EXIT]], i64 [[PHI_BROKEN1]]) +; OPT-NEXT: [[TMP3:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP2]]) ; OPT-NEXT: br i1 [[TMP3]], label [[LOOP_EXIT:%.*]], label [[LOOP]] ; OPT: loop_exit: -; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP7]]) ; OPT-NEXT: [[TMP12:%.*]] = zext i32 [[TMP]] to i64 ; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[ARG1:%.*]], i64 [[TMP12]] ; OPT-NEXT: [[TMP14:%.*]] = addrspacecast ptr [[TMP13]] to ptr addrspace(1) diff --git a/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir b/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir index a39fb827c06ff..036f250f3b3ef 100644 --- a/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir +++ b/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir @@ -1,3 +1,4 @@ +; XFAIL: * # RUN: llc -mtriple=amdgcn -run-pass si-fix-sgpr-copies %s -o - | FileCheck %s -check-prefixes=GCN --- @@ -22,15 +23,19 @@ body: | %7 = COPY $vgpr0 %8 = S_MOV_B32 0 - bb.1: + bb.1: %0 = PHI %8, %bb.0, %0, %bb.1, %2, %bb.2 %9 = V_MOV_B32_e32 9, implicit $exec %10 = V_CMP_EQ_U32_e64 %7, %9, implicit $exec - %1 = SI_IF %10, %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec + %1 = SI_IF %10, %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec S_BRANCH %bb.1 + bb.3: + SI_WAVE_RECONVERGE %1, implicit-def $exec, implicit-def $scc, implicit $exec + S_BRANCH %bb.2 + + bb.2: - SI_END_CF %1, implicit-def $exec, implicit-def $scc, implicit $exec %11 = S_MOV_B32 1 %2 = S_ADD_I32 %0, %11, implicit-def $scc S_BRANCH %bb.1 diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-kill.ll b/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-kill.ll index 917743bf5d14c..f40112121ce78 100644 --- a/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-kill.ll +++ b/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-kill.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}if_with_kill: diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll b/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll index 13745d4d5b171..4a0cf60a1004a 100644 --- a/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll +++ b/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; RUN: llc -mtriple=amdgcn -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}lower_control_flow_unreachable_terminator: diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-control-flow.mir b/llvm/test/CodeGen/AMDGPU/si-lower-control-flow.mir index eddad05d976bd..fc2e81ad29d42 100644 --- a/llvm/test/CodeGen/AMDGPU/si-lower-control-flow.mir +++ b/llvm/test/CodeGen/AMDGPU/si-lower-control-flow.mir @@ -28,12 +28,12 @@ body: | ; GCN: bb.0: ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], undef %1:sreg_64, implicit-def dead $scc - ; GCN-NEXT: dead [[S_XOR_B64_:%[0-9]+]]:sreg_64 = S_XOR_B64 [[S_AND_B64_]], [[COPY]], implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term [[S_AND_B64_]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec - ; GCN-NEXT: S_BRANCH %bb.1 + ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 undef %1:sreg_64, $exec, implicit-def $scc + ; GCN-NEXT: dead [[S_XOR_B64_:%[0-9]+]]:sreg_64 = S_XOR_B64 [[S_AND_B64_]], $exec, implicit-def $scc + ; GCN-NEXT: dead [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[S_AND_B64_]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_]], implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.2 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: %bb.2(0x80000000) @@ -67,19 +67,17 @@ body: | ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GCN-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[COPY1]], implicit $exec - ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY2]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc - ; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_]], [[COPY2]], implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term [[S_AND_B64_]] + ; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[V_CMP_EQ_U32_e64_]], $exec, implicit-def $scc + ; GCN-NEXT: dead [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[V_CMP_EQ_U32_e64_]], implicit $scc ; GCN-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_]], implicit $exec - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec - ; GCN-NEXT: S_BRANCH %bb.2 + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: %bb.2(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[S_MOV_B64_term]] - ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY3]], implicit-def $scc + ; GCN-NEXT: dead [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY [[S_MOV_B64_term]] ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: ; GCN-NEXT: S_ENDPGM 0 @@ -97,7 +95,6 @@ body: | successors: %bb.2 %6:sreg_64_xexec = COPY %5 - SI_END_CF killed %6, implicit-def $exec, implicit-def dead $scc, implicit $exec bb.2: S_ENDPGM 0 @@ -116,26 +113,19 @@ body: | ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GCN-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[COPY1]], implicit $exec - ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY2]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc - ; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_]], [[COPY2]], implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term [[S_AND_B64_]] + ; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[V_CMP_EQ_U32_e64_]], $exec, implicit-def $scc + ; GCN-NEXT: dead [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[V_CMP_EQ_U32_e64_]], implicit $scc ; GCN-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_]], implicit $exec - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec - ; GCN-NEXT: S_BRANCH %bb.2 + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: - ; GCN-NEXT: successors: %bb.3(0x80000000) - ; GCN-NEXT: liveins: $vgpr0, $sgpr4_sgpr5 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[S_MOV_B64_term]] - ; GCN-NEXT: S_NOP 0 - ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY3]], implicit-def $scc - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.3: ; GCN-NEXT: successors: %bb.2(0x80000000) ; GCN-NEXT: liveins: $vgpr0, $sgpr4_sgpr5 ; GCN-NEXT: {{ $}} + ; GCN-NEXT: dead [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY [[S_MOV_B64_term]] + ; GCN-NEXT: S_NOP 0 ; GCN-NEXT: S_SLEEP 3 ; GCN-NEXT: S_NOP 0, implicit $vgpr0, implicit $sgpr4_sgpr5 ; GCN-NEXT: {{ $}} @@ -157,7 +147,6 @@ body: | %6:sreg_64_xexec = COPY %5 S_NOP 0 - SI_END_CF killed %6, implicit-def $exec, implicit-def dead $scc, implicit $exec S_SLEEP 3 S_NOP 0, implicit $vgpr0, implicit $sgpr4_sgpr5 @@ -178,25 +167,18 @@ body: | ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GCN-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[COPY1]], implicit $exec - ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY2]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc - ; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_]], [[COPY2]], implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term [[S_AND_B64_]] + ; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[V_CMP_EQ_U32_e64_]], $exec, implicit-def $scc + ; GCN-NEXT: dead [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[V_CMP_EQ_U32_e64_]], implicit $scc ; GCN-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_]], implicit $exec - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec - ; GCN-NEXT: S_BRANCH %bb.2 + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: - ; GCN-NEXT: successors: %bb.3(0x80000000) - ; GCN-NEXT: liveins: $vgpr0, $sgpr4_sgpr5, $sgpr8_sgpr9_sgpr10_sgpr11:0x0000000000000003 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[S_MOV_B64_term]] - ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY3]], implicit-def $scc - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.3: ; GCN-NEXT: successors: %bb.2(0x80000000) - ; GCN-NEXT: liveins: $vgpr0, $sgpr4_sgpr5, $sgpr8_sgpr9_sgpr10 + ; GCN-NEXT: liveins: $vgpr0, $sgpr4_sgpr5, $sgpr8_sgpr9_sgpr10_sgpr11:0x0000000000000003 ; GCN-NEXT: {{ $}} + ; GCN-NEXT: dead [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY [[S_MOV_B64_term]] ; GCN-NEXT: S_SLEEP 3 ; GCN-NEXT: S_NOP 0 ; GCN-NEXT: {{ $}} @@ -219,7 +201,6 @@ body: | liveins: $vgpr0, $sgpr4_sgpr5, $sgpr8_sgpr9_sgpr10_sgpr11:0x00000003 %6:sreg_64_xexec = COPY %5 - SI_END_CF killed %6, implicit-def $exec, implicit-def dead $scc, implicit $exec S_SLEEP 3 S_NOP 0 @@ -241,23 +222,17 @@ body: | ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GCN-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[COPY1]], implicit $exec - ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY2]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc - ; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_]], [[COPY2]], implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term [[S_AND_B64_]] + ; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[V_CMP_EQ_U32_e64_]], $exec, implicit-def $scc + ; GCN-NEXT: dead [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[V_CMP_EQ_U32_e64_]], implicit $scc ; GCN-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_]], implicit $exec - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec - ; GCN-NEXT: S_BRANCH %bb.2 + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: - ; GCN-NEXT: successors: %bb.3(0x80000000) - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[S_MOV_B64_term]] - ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY3]], implicit-def $scc - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.3: ; GCN-NEXT: successors: %bb.2(0x80000000) ; GCN-NEXT: {{ $}} + ; GCN-NEXT: dead [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY [[S_MOV_B64_term]] ; GCN-NEXT: $vgpr3 = V_MOV_B32_e32 0, implicit $exec ; GCN-NEXT: $sgpr4_sgpr5 = S_MOV_B64 32 ; GCN-NEXT: {{ $}} @@ -279,7 +254,6 @@ body: | successors: %bb.2 %6:sreg_64_xexec = COPY %5 - SI_END_CF killed %6, implicit-def $exec, implicit-def dead $scc, implicit $exec $vgpr3 = V_MOV_B32_e32 0, implicit $exec $sgpr4_sgpr5 = S_MOV_B64 32 @@ -301,26 +275,19 @@ body: | ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GCN-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[COPY1]], implicit $exec - ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY2]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc - ; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_]], [[COPY2]], implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term [[S_AND_B64_]] + ; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[V_CMP_EQ_U32_e64_]], $exec, implicit-def $scc + ; GCN-NEXT: dead [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[V_CMP_EQ_U32_e64_]], implicit $scc ; GCN-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_]], implicit $exec - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec - ; GCN-NEXT: S_BRANCH %bb.2 + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: - ; GCN-NEXT: successors: %bb.3(0x80000000) - ; GCN-NEXT: liveins: $vgpr0, $sgpr4_sgpr5 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[S_MOV_B64_term]] - ; GCN-NEXT: $sgpr4_sgpr5 = S_MOV_B64 32 - ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY3]], implicit-def $scc - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.3: ; GCN-NEXT: successors: %bb.2(0x80000000) ; GCN-NEXT: liveins: $vgpr0, $sgpr4_sgpr5 ; GCN-NEXT: {{ $}} + ; GCN-NEXT: dead [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY [[S_MOV_B64_term]] + ; GCN-NEXT: $sgpr4_sgpr5 = S_MOV_B64 32 ; GCN-NEXT: S_SLEEP 3, implicit $sgpr4_sgpr5 ; GCN-NEXT: S_NOP 0 ; GCN-NEXT: {{ $}} @@ -344,7 +311,6 @@ body: | %6:sreg_64_xexec = COPY %5 $sgpr4_sgpr5 = S_MOV_B64 32 - SI_END_CF killed %6, implicit-def $exec, implicit-def dead $scc, implicit $exec S_SLEEP 3, implicit $sgpr4_sgpr5 S_NOP 0 @@ -371,20 +337,15 @@ body: | ; GCN-NEXT: dead [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: - ; GCN-NEXT: successors: %bb.3(0x80000000) - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[V_CMP_EQ_U32_e64_]] - ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY3]], implicit-def $scc - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.3: ; GCN-NEXT: successors: %bb.2(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY4]], [[V_CMP_EQ_U32_e64_1]], implicit-def dead $scc - ; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_]], [[COPY4]], implicit-def dead $scc - ; GCN-NEXT: $exec = S_MOV_B64_term [[S_AND_B64_]] + ; GCN-NEXT: dead [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[V_CMP_EQ_U32_e64_]] + ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_1]], $exec, implicit-def $scc + ; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_]], $exec, implicit-def $scc + ; GCN-NEXT: dead [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[S_AND_B64_]], -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64_term [[S_AND_B64_]], implicit $scc ; GCN-NEXT: dead [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_]], implicit $exec - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; GCN-NEXT: S_BRANCH %bb.2 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: ; GCN-NEXT: S_ENDPGM 0 @@ -402,7 +363,6 @@ body: | successors: %bb.2 %6:sreg_64_xexec = COPY %3 - SI_END_CF killed %6, implicit-def $exec, implicit-def dead $scc, implicit $exec %7:sreg_64_xexec = SI_IF %4, %bb.2, implicit-def $exec, implicit-def dead $scc, implicit $exec %8:sreg_64_xexec = S_MOV_B64_term %7, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-i1-copies-order-of-phi-incomings.mir b/llvm/test/CodeGen/AMDGPU/si-lower-i1-copies-order-of-phi-incomings.mir index ecbd47a9e8d0d..da57e211d8871 100644 --- a/llvm/test/CodeGen/AMDGPU/si-lower-i1-copies-order-of-phi-incomings.mir +++ b/llvm/test/CodeGen/AMDGPU/si-lower-i1-copies-order-of-phi-incomings.mir @@ -61,12 +61,12 @@ body: | ; GCN-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32 = S_ANDN2_B32 [[S_OR_B32_]], $exec_lo, implicit-def $scc ; GCN-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_CMP_EQ_U32_e64_]], $exec_lo, implicit-def $scc ; GCN-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_ANDN2_B32_1]], [[S_AND_B32_1]], implicit-def $scc + ; GCN-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.3: ; GCN-NEXT: successors: %bb.4(0x04000000), %bb.1(0x7c000000) ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[PHI4:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]], %bb.1, [[S_OR_B32_1]], %bb.2 - ; GCN-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 4 ; GCN-NEXT: [[V_ADD_U:%[0-9]+]]:vreg_64 = V_ADD_U64_PSEUDO [[PHI3]], killed [[S_MOV_B64_]], implicit-def dead $vcc_lo, implicit $exec ; GCN-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 1 @@ -124,12 +124,12 @@ body: | %24:sreg_32 = S_MOV_B32 0 %25:sreg_32 = V_CMP_EQ_U32_e64 killed %23, killed %24, implicit $exec %26:vreg_1 = COPY %25 + SI_WAVE_RECONVERGE %22, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.3: successors: %bb.4(0x04000000), %bb.1(0x7c000000) %20:vreg_1 = PHI %26, %bb.2, %19, %bb.1 ;%20:vreg_1 = PHI %19, %bb.1, %26, %bb.2 - this is original phi created by SDAG - SI_END_CF %22, implicit-def dead $exec, implicit-def dead $scc, implicit $exec %27:sreg_64 = S_MOV_B64 4 %18:vreg_64 = V_ADD_U64_PSEUDO %17, killed %27, implicit-def dead $vcc, implicit $exec %28:sreg_32 = S_MOV_B32 1 diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-i1-copies.mir b/llvm/test/CodeGen/AMDGPU/si-lower-i1-copies.mir index 9312322c04afe..b37086e0760ea 100644 --- a/llvm/test/CodeGen/AMDGPU/si-lower-i1-copies.mir +++ b/llvm/test/CodeGen/AMDGPU/si-lower-i1-copies.mir @@ -23,11 +23,10 @@ body: | bb.2: %6:vreg_1 = PHI %5, %bb.1 - SI_END_CF %3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + SI_WAVE_RECONVERGE %11, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.3: %7:vreg_1 = PHI %6, %bb.2, %8, %bb.0 - SI_END_CF %11, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_ENDPGM 0 ... diff --git a/llvm/test/CodeGen/AMDGPU/si-opt-vgpr-liverange-bug-deadlanes.mir b/llvm/test/CodeGen/AMDGPU/si-opt-vgpr-liverange-bug-deadlanes.mir index f234ea24a9fe7..df933174e0d5c 100644 --- a/llvm/test/CodeGen/AMDGPU/si-opt-vgpr-liverange-bug-deadlanes.mir +++ b/llvm/test/CodeGen/AMDGPU/si-opt-vgpr-liverange-bug-deadlanes.mir @@ -55,10 +55,10 @@ body: | ; CHECK-NEXT: successors: %bb.4(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[PHI1]], %subreg.sub0, [[PHI1]], %subreg.sub1, [[PHI1]], %subreg.sub2, undef %6:vgpr_32, %subreg.sub3 + ; CHECK-NEXT: SI_WAVE_RECONVERGE killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: [[PHI2:%[0-9]+]]:vreg_128 = PHI [[PHI]], %bb.2, [[REG_SEQUENCE1]], %bb.3 - ; CHECK-NEXT: SI_END_CF killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: dead [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[PHI2]].sub2, %subreg.sub0, [[PHI2]].sub2, %subreg.sub1, [[PHI2]].sub2, %subreg.sub2, undef [[BUFFER_LOAD_DWORD_OFFEN]], %subreg.sub3 ; CHECK-NEXT: S_ENDPGM 0 bb.0: @@ -88,10 +88,10 @@ body: | successors: %bb.8(0x80000000) %12:vreg_128 = REG_SEQUENCE %3, %subreg.sub0, %3, %subreg.sub1, killed %3, %subreg.sub2, undef %7, %subreg.sub3 + SI_WAVE_RECONVERGE killed %11, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.8: %13:vreg_128 = PHI %10, %bb.6, %12, %bb.7 - SI_END_CF killed %11, implicit-def dead $exec, implicit-def dead $scc, implicit $exec %5:vreg_128 = REG_SEQUENCE %13.sub2, %subreg.sub0, %13.sub2, %subreg.sub1, killed %13.sub2, %subreg.sub2, undef %3, %subreg.sub3 S_ENDPGM 0 ... diff --git a/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.ll b/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.ll index d34769ad0fcf0..bcbc4a933538c 100644 --- a/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.ll +++ b/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.ll @@ -16,30 +16,30 @@ define void @__omp_offloading_35_36570d3__ZN6openmc31process_advance_particle_ev ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7] -; GCN-NEXT: s_cbranch_execnz .LBB0_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB0_4 -; GCN-NEXT: .LBB0_2: ; %bb3 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB0_3: ; %bb2 +; GCN-NEXT: s_and_b64 s[6:7], s[4:5], exec +; GCN-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; GCN-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GCN-NEXT: s_cmov_b64 exec, s[6:7] +; GCN-NEXT: s_cbranch_scc0 .LBB0_2 +; GCN-NEXT: ; %bb.1: ; %bb2 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: v_mov_b32_e32 v4, v3 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: flat_store_dwordx2 v[1:2], v[3:4] ; GCN-NEXT: ; implicit-def: $vgpr1_vgpr2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: .LBB0_4: ; %bb1 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: .LBB0_2: ; %Flow +; GCN-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; GCN-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GCN-NEXT: s_cmov_b64 exec, s[4:5] +; GCN-NEXT: s_cbranch_scc0 .LBB0_4 +; GCN-NEXT: ; %bb.3: ; %bb1 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: v_mov_b32_e32 v4, v3 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: flat_store_dwordx2 v[1:2], v[3:4] -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: .LBB0_4: ; %bb3 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] bb: diff --git a/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.mir b/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.mir index 3bdcc14936fb9..113e667758310 100644 --- a/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.mir +++ b/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.mir @@ -1,3 +1,4 @@ +; XFAIL: * # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -run-pass=si-opt-vgpr-liverange %s -o - | FileCheck -check-prefix=GCN %s # SIOptimizeVGPRLiveRange shouldn't try to modify use of %5 in DBG_VALUE_LIST @@ -94,6 +95,7 @@ body: | %8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec %9:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %8, %subreg.sub1 FLAT_STORE_DWORDX2 %5, killed %9, 0, 0, implicit $exec, implicit $flat_scr + SI_WAVE_RECONVERGE %7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.4 bb.3: @@ -105,7 +107,6 @@ body: | S_BRANCH %bb.1 bb.4: - SI_END_CF %7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec DBG_VALUE_LIST !4, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_constu, 2712, DW_OP_mul, DW_OP_plus, DW_OP_plus_uconst, 2680, DW_OP_stack_value), %5, 0, debug-location !9 SI_RETURN ... diff --git a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll index f9a17783f0d35..3b17099c6871b 100644 --- a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll +++ b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll @@ -8,53 +8,22 @@ declare i32 @llvm.amdgcn.workitem.id.x() define amdgpu_kernel void @kernel(i32 %a, ptr addrspace(1) %x, i32 noundef %n) { ; This used to bypass the structurization process because structurizer is unable to ; handle multiple-exits CFG. This should be correctly structurized. -; UNIFY-LABEL: define amdgpu_kernel void @kernel -; UNIFY-LABEL: entry: -; UNIFY: %tid = call i32 @llvm.amdgcn.workitem.id.x() -; UNIFY-NEXT: %cmp = icmp eq i32 %n.load, 256 -; UNIFY-NEXT: br i1 %cmp, label %if.then, label %if.else -; UNIFY-LABEL: if.then: -; UNIFY-NEXT: %cmp1 = icmp eq i32 %a.load, 0 -; UNIFY-NEXT: br i1 %cmp1, label %if.end6.sink.split, label %cond.false -; UNIFY-LABEL: cond.false: -; UNIFY-NEXT: call void @llvm.trap() -; UNIFY-NEXT: br label %UnifiedUnreachableBlock -; UNIFY-LABEL: if.else: -; UNIFY-NEXT: %cmp2 = icmp ult i32 %tid, 10 -; UNIFY-NEXT: br i1 %cmp2, label %if.then3, label %UnifiedReturnBlock -; UNIFY-LABEL: if.then3: -; UNIFY-NEXT: %cmp1.i7 = icmp eq i32 %a.load, 0 -; UNIFY-NEXT: br i1 %cmp1.i7, label %if.end6.sink.split, label %cond.false.i8 -; UNIFY-LABEL: cond.false.i8: -; UNIFY-NEXT: call void @llvm.trap() -; UNIFY-NEXT: br label %UnifiedUnreachableBlock -; UNIFY-LABEL: if.end6.sink.split: -; UNIFY-NEXT: %x.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %kernel.kernarg.segment, i64 8 -; UNIFY-NEXT: %x.load = load ptr addrspace(1), ptr addrspace(4) %x.kernarg.offset, align 8, !invariant.load !0 -; UNIFY-NEXT: %idxprom = sext i32 %tid to i64 -; UNIFY-NEXT: %x1 = getelementptr inbounds i32, ptr addrspace(1) %x.load, i64 %idxprom -; UNIFY-NEXT: store i32 %a.load, ptr addrspace(1) %x1, align 4 -; UNIFY-NEXT: br label %UnifiedReturnBlock -; UNIFY-LABEL: UnifiedUnreachableBlock: -; UNIFY-NEXT: call void @llvm.amdgcn.unreachable() -; UNIFY-NEXT: br label %UnifiedReturnBlock -; UNIFY-LABEL: UnifiedReturnBlock: -; UNIFY-NEXT: ret void - ; CHECK-LABEL: kernel: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dword s0, s[4:5], 0x10 ; CHECK-NEXT: s_load_dword s10, s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_cmpk_lg_i32 s0, 0x100 -; CHECK-NEXT: s_cbranch_scc0 .LBB0_6 +; CHECK-NEXT: s_cbranch_scc0 .LBB0_5 ; CHECK-NEXT: ; %bb.1: ; %if.else ; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 10, v0 +; CHECK-NEXT: s_xor_b64 s[8:9], vcc, exec +; CHECK-NEXT: s_and_b64 s[0:1], vcc, -1 ; CHECK-NEXT: s_mov_b64 s[6:7], 0 ; CHECK-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-NEXT: s_mov_b64 s[0:1], 0 -; CHECK-NEXT: s_and_saveexec_b64 s[8:9], vcc -; CHECK-NEXT: s_cbranch_execz .LBB0_5 +; CHECK-NEXT: s_cmov_b64 exec, vcc +; CHECK-NEXT: s_cbranch_scc0 .LBB0_6 ; CHECK-NEXT: ; %bb.2: ; %if.then3 ; CHECK-NEXT: s_cmp_lg_u32 s10, 0 ; CHECK-NEXT: s_cbranch_scc1 .LBB0_14 @@ -63,27 +32,33 @@ define amdgpu_kernel void @kernel(i32 %a, ptr addrspace(1) %x, i32 noundef %n) { ; CHECK-NEXT: .LBB0_4: ; %Flow3 ; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec ; CHECK-NEXT: s_and_b64 s[2:3], s[2:3], exec -; CHECK-NEXT: .LBB0_5: ; %Flow2 ; CHECK-NEXT: s_or_b64 exec, exec, s[8:9] -; CHECK-NEXT: s_and_b64 vcc, exec, s[6:7] -; CHECK-NEXT: s_cbranch_vccz .LBB0_8 -; CHECK-NEXT: s_branch .LBB0_7 -; CHECK-NEXT: .LBB0_6: +; CHECK-NEXT: s_branch .LBB0_6 +; CHECK-NEXT: .LBB0_5: +; CHECK-NEXT: s_mov_b64 s[6:7], -1 ; CHECK-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-NEXT: s_mov_b64 s[0:1], 0 -; CHECK-NEXT: s_cbranch_execz .LBB0_8 -; CHECK-NEXT: .LBB0_7: ; %if.then +; CHECK-NEXT: .LBB0_6: ; %Flow +; CHECK-NEXT: s_and_b64 vcc, exec, s[6:7] +; CHECK-NEXT: s_cbranch_vccz .LBB0_8 +; CHECK-NEXT: ; %bb.7: ; %if.then ; CHECK-NEXT: s_cmp_lg_u32 s10, 0 ; CHECK-NEXT: s_mov_b64 s[0:1], -1 ; CHECK-NEXT: s_cbranch_scc1 .LBB0_13 ; CHECK-NEXT: .LBB0_8: ; %Flow4 -; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] -; CHECK-NEXT: .LBB0_9: ; %UnifiedUnreachableBlock +; CHECK-NEXT: s_and_b64 s[2:3], s[2:3], exec +; CHECK-NEXT: s_mov_b64 s[6:7], exec +; CHECK-NEXT: s_and_b64 s[8:9], s[2:3], -1 +; CHECK-NEXT: s_cmov_b64 exec, s[2:3] +; CHECK-NEXT: s_cbranch_scc0 .LBB0_10 +; CHECK-NEXT: ; %bb.9: ; %UnifiedUnreachableBlock ; CHECK-NEXT: ; divergent unreachable -; CHECK-NEXT: .LBB0_10: ; %Flow6 ; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] -; CHECK-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] -; CHECK-NEXT: s_cbranch_execz .LBB0_12 +; CHECK-NEXT: .LBB0_10: ; %Flow6 +; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CHECK-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; CHECK-NEXT: s_cmov_b64 exec, s[0:1] +; CHECK-NEXT: s_cbranch_scc0 .LBB0_12 ; CHECK-NEXT: ; %bb.11: ; %if.end6.sink.split ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -96,13 +71,12 @@ define amdgpu_kernel void @kernel(i32 %a, ptr addrspace(1) %x, i32 noundef %n) { ; CHECK-NEXT: s_mov_b64 s[0:1], 0 ; CHECK-NEXT: s_or_b64 s[2:3], s[2:3], exec ; CHECK-NEXT: s_trap 2 -; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] -; CHECK-NEXT: s_cbranch_execnz .LBB0_9 -; CHECK-NEXT: s_branch .LBB0_10 +; CHECK-NEXT: s_branch .LBB0_8 ; CHECK-NEXT: .LBB0_14: ; %cond.false.i8 ; CHECK-NEXT: s_mov_b64 s[2:3], -1 ; CHECK-NEXT: s_trap 2 ; CHECK-NEXT: s_branch .LBB0_4 + entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %cmp = icmp eq i32 %n, 256 @@ -136,3 +110,5 @@ if.end6.sink.split: if.end6: ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; UNIFY: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll b/llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll index 1eef7b967f6d9..fe037d0ca9ad7 100644 --- a/llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll +++ b/llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll @@ -1,5 +1,5 @@ +; XFAIL: * ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s ; RUN: opt -mtriple=amdgcn-amd-amdhsa -lowerswitch -amdgpu-unify-divergent-exit-nodes -verify -structurizecfg -verify -si-annotate-control-flow -verify -S %s -o - | FileCheck -check-prefix=IR %s @@ -47,6 +47,7 @@ define void @my_func(i32 %0) { ; IR-NEXT: br i1 [[TMP12]], label [[LEAFBLOCK5:%.*]], label [[FLOW13:%.*]] ; IR: LeafBlock5: ; IR-NEXT: [[SWITCHLEAF6:%.*]] = icmp eq i32 [[TMP0]], 2 +; IR-NEXT: call void @llvm.amdgcn.wave.reconverge.i64(i64 [[TMP13]]) ; IR-NEXT: br label [[FLOW13]] ; IR: Flow13: ; IR-NEXT: [[TMP14:%.*]] = phi i1 [ true, [[LEAFBLOCK5]] ], [ false, [[NODEBLOCK7]] ] @@ -58,11 +59,11 @@ define void @my_func(i32 %0) { ; IR: LeafBlock3: ; IR-NEXT: [[SWITCHLEAF4:%.*]] = icmp eq i32 [[TMP0]], 0 ; IR-NEXT: [[SWITCHLEAF4_INV:%.*]] = xor i1 [[SWITCHLEAF4]], true +; IR-NEXT: call void @llvm.amdgcn.wave.reconverge.i64(i64 [[TMP18]]) ; IR-NEXT: br label [[FLOW14]] ; IR: Flow14: ; IR-NEXT: [[TMP19:%.*]] = phi i1 [ [[SWITCHLEAF4_INV]], [[LEAFBLOCK3]] ], [ [[TMP14]], [[FLOW13]] ] ; IR-NEXT: [[TMP20:%.*]] = phi i1 [ [[SWITCHLEAF4]], [[LEAFBLOCK3]] ], [ [[TMP15]], [[FLOW13]] ] -; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP18]]) ; IR-NEXT: [[TMP21:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP20]]) ; IR-NEXT: [[TMP22:%.*]] = extractvalue { i1, i64 } [[TMP21]], 0 ; IR-NEXT: [[TMP23:%.*]] = extractvalue { i1, i64 } [[TMP21]], 1 @@ -72,7 +73,6 @@ define void @my_func(i32 %0) { ; IR: Flow15: ; IR-NEXT: [[TMP24]] = phi i1 [ [[TMP29:%.*]], [[FLOW16:%.*]] ], [ false, [[FLOW14]] ] ; IR-NEXT: [[TMP25]] = phi i1 [ [[TMP30:%.*]], [[FLOW16]] ], [ [[TMP19]], [[FLOW14]] ] -; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP23]]) ; IR-NEXT: br label [[FLOW12]] ; IR: LeafBlock9: ; IR-NEXT: [[SWITCHLEAF10:%.*]] = icmp sgt i32 [[TMP0]], 1 @@ -82,27 +82,28 @@ define void @my_func(i32 %0) { ; IR-NEXT: br i1 [[TMP27]], label [[DO_BODY_I_I_I_I:%.*]], label [[FLOW16]] ; IR: do.body.i.i.i.i: ; IR-NEXT: tail call fastcc void null() +; IR-NEXT: call void @llvm.amdgcn.wave.reconverge.i64(i64 [[TMP28]]) ; IR-NEXT: br label [[FLOW16]] ; IR: Flow16: ; IR-NEXT: [[TMP29]] = phi i1 [ true, [[DO_BODY_I_I_I_I]] ], [ false, [[LEAFBLOCK9]] ] ; IR-NEXT: [[TMP30]] = phi i1 [ false, [[DO_BODY_I_I_I_I]] ], [ true, [[LEAFBLOCK9]] ] -; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP28]]) +; IR-NEXT: call void @llvm.amdgcn.wave.reconverge.i64(i64 [[TMP23]]) ; IR-NEXT: br label [[FLOW15]] ; IR: do.body: ; IR-NEXT: tail call fastcc void null() +; IR-NEXT: call void @llvm.amdgcn.wave.reconverge.i64(i64 [[TMP8]]) ; IR-NEXT: br label [[FLOW17]] ; IR: Flow17: ; IR-NEXT: [[TMP31:%.*]] = phi i1 [ true, [[DO_BODY]] ], [ [[TMP4]], [[FLOW11]] ] -; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP8]]) ; IR-NEXT: [[TMP32:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP31]]) ; IR-NEXT: [[TMP33:%.*]] = extractvalue { i1, i64 } [[TMP32]], 0 ; IR-NEXT: [[TMP34:%.*]] = extractvalue { i1, i64 } [[TMP32]], 1 ; IR-NEXT: br i1 [[TMP33]], label [[UNIFIEDUNREACHABLEBLOCK:%.*]], label [[UNIFIEDRETURNBLOCK:%.*]] ; IR: UnifiedUnreachableBlock: ; IR-NEXT: call void @llvm.amdgcn.unreachable() +; IR-NEXT: call void @llvm.amdgcn.wave.reconverge.i64(i64 [[TMP34]]) ; IR-NEXT: br label [[UNIFIEDRETURNBLOCK]] ; IR: UnifiedReturnBlock: -; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP34]]) ; IR-NEXT: ret void ; ; GCN-LABEL: my_func: diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll index 2c0f64f85d823..bea224b3b530a 100644 --- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll +++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll @@ -296,7 +296,7 @@ entry: ; GCN: v_readfirstlane_b32 ; GCN: s_and_saveexec_b64 ; GCN: s_swappc_b64 -; GCN: s_cbranch_execnz +; GCN: s_cbranch_scc1 ; GCN: s_setpc_b64 define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr %func.ptr, i32 %a, i32 %b, i32 %c) #1 { entry: diff --git a/llvm/test/CodeGen/AMDGPU/skip-branch-trap.ll b/llvm/test/CodeGen/AMDGPU/skip-branch-trap.ll index 6f768641b5b03..c05835dcdd8e1 100644 --- a/llvm/test/CodeGen/AMDGPU/skip-branch-trap.ll +++ b/llvm/test/CodeGen/AMDGPU/skip-branch-trap.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; RUN: llc -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s ; FIXME: merge with trap.ll diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll index d19ef75cb08cd..c0f98946b4161 100644 --- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll +++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll @@ -936,11 +936,12 @@ exit: define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) #0 { ; SI-LABEL: test_kill_divergent_loop: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_mov_b64 s[0:1], exec ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[2:3] -; SI-NEXT: s_cbranch_execz .LBB10_4 +; SI-NEXT: s_xor_b64 s[4:5], vcc, exec +; SI-NEXT: s_mov_b64 s[0:1], exec +; SI-NEXT: s_and_b64 s[2:3], vcc, -1 +; SI-NEXT: s_cmov_b64 exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB10_5 ; SI-NEXT: ; %bb.1: ; %bb.preheader ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -961,7 +962,7 @@ define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) #0 { ; SI-NEXT: ;;#ASMEND ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v7 ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc -; SI-NEXT: s_cbranch_scc0 .LBB10_5 +; SI-NEXT: s_cbranch_scc0 .LBB10_6 ; SI-NEXT: ; %bb.3: ; %bb ; SI-NEXT: ; in Loop: Header=BB10_2 Depth=1 ; SI-NEXT: s_andn2_b64 exec, exec, vcc @@ -969,15 +970,16 @@ define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; SI-NEXT: s_cbranch_vccnz .LBB10_2 -; SI-NEXT: .LBB10_4: ; %Flow1 +; SI-NEXT: ; %bb.4: ; %Flow ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: .LBB10_5: ; %exit ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 8 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm -; SI-NEXT: .LBB10_5: +; SI-NEXT: .LBB10_6: ; SI-NEXT: s_mov_b64 exec, 0 ; SI-NEXT: exp null off, off, off, off done vm ; SI-NEXT: s_endpgm @@ -986,9 +988,10 @@ define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) #0 { ; GFX10-WAVE64: ; %bb.0: ; %entry ; GFX10-WAVE64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10-WAVE64-NEXT: s_mov_b64 s[0:1], exec -; GFX10-WAVE64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10-WAVE64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX10-WAVE64-NEXT: s_cbranch_execz .LBB10_3 +; GFX10-WAVE64-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX10-WAVE64-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX10-WAVE64-NEXT: s_cmov_b64 exec, vcc +; GFX10-WAVE64-NEXT: s_cbranch_scc0 .LBB10_4 ; GFX10-WAVE64-NEXT: .LBB10_1: ; %bb ; GFX10-WAVE64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-WAVE64-NEXT: ;;#ASMSTART @@ -1006,7 +1009,7 @@ define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) #0 { ; GFX10-WAVE64-NEXT: ;;#ASMEND ; GFX10-WAVE64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v7 ; GFX10-WAVE64-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc -; GFX10-WAVE64-NEXT: s_cbranch_scc0 .LBB10_4 +; GFX10-WAVE64-NEXT: s_cbranch_scc0 .LBB10_5 ; GFX10-WAVE64-NEXT: ; %bb.2: ; %bb ; GFX10-WAVE64-NEXT: ; in Loop: Header=BB10_1 Depth=1 ; GFX10-WAVE64-NEXT: s_andn2_b64 exec, exec, vcc @@ -1014,13 +1017,14 @@ define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) #0 { ; GFX10-WAVE64-NEXT: s_waitcnt vmcnt(0) ; GFX10-WAVE64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10-WAVE64-NEXT: s_cbranch_vccnz .LBB10_1 -; GFX10-WAVE64-NEXT: .LBB10_3: ; %Flow1 +; GFX10-WAVE64-NEXT: ; %bb.3: ; %Flow ; GFX10-WAVE64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10-WAVE64-NEXT: .LBB10_4: ; %exit ; GFX10-WAVE64-NEXT: v_mov_b32_e32 v0, 8 ; GFX10-WAVE64-NEXT: global_store_dword v[0:1], v0, off ; GFX10-WAVE64-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WAVE64-NEXT: s_endpgm -; GFX10-WAVE64-NEXT: .LBB10_4: +; GFX10-WAVE64-NEXT: .LBB10_5: ; GFX10-WAVE64-NEXT: s_mov_b64 exec, 0 ; GFX10-WAVE64-NEXT: exp null off, off, off, off done vm ; GFX10-WAVE64-NEXT: s_endpgm @@ -1029,9 +1033,10 @@ define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) #0 { ; GFX10-WAVE32: ; %bb.0: ; %entry ; GFX10-WAVE32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-WAVE32-NEXT: s_mov_b32 s0, exec_lo -; GFX10-WAVE32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10-WAVE32-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX10-WAVE32-NEXT: s_cbranch_execz .LBB10_3 +; GFX10-WAVE32-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX10-WAVE32-NEXT: s_and_b32 s2, vcc_lo, -1 +; GFX10-WAVE32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-WAVE32-NEXT: s_cbranch_scc0 .LBB10_4 ; GFX10-WAVE32-NEXT: .LBB10_1: ; %bb ; GFX10-WAVE32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-WAVE32-NEXT: ;;#ASMSTART @@ -1049,7 +1054,7 @@ define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) #0 { ; GFX10-WAVE32-NEXT: ;;#ASMEND ; GFX10-WAVE32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v7 ; GFX10-WAVE32-NEXT: s_andn2_b32 s0, s0, vcc_lo -; GFX10-WAVE32-NEXT: s_cbranch_scc0 .LBB10_4 +; GFX10-WAVE32-NEXT: s_cbranch_scc0 .LBB10_5 ; GFX10-WAVE32-NEXT: ; %bb.2: ; %bb ; GFX10-WAVE32-NEXT: ; in Loop: Header=BB10_1 Depth=1 ; GFX10-WAVE32-NEXT: s_andn2_b32 exec_lo, exec_lo, vcc_lo @@ -1057,24 +1062,26 @@ define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) #0 { ; GFX10-WAVE32-NEXT: s_waitcnt vmcnt(0) ; GFX10-WAVE32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-WAVE32-NEXT: s_cbranch_vccnz .LBB10_1 -; GFX10-WAVE32-NEXT: .LBB10_3: ; %Flow1 +; GFX10-WAVE32-NEXT: ; %bb.3: ; %Flow ; GFX10-WAVE32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-WAVE32-NEXT: .LBB10_4: ; %exit ; GFX10-WAVE32-NEXT: v_mov_b32_e32 v0, 8 ; GFX10-WAVE32-NEXT: global_store_dword v[0:1], v0, off ; GFX10-WAVE32-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WAVE32-NEXT: s_endpgm -; GFX10-WAVE32-NEXT: .LBB10_4: +; GFX10-WAVE32-NEXT: .LBB10_5: ; GFX10-WAVE32-NEXT: s_mov_b32 exec_lo, 0 ; GFX10-WAVE32-NEXT: exp null off, off, off, off done vm ; GFX10-WAVE32-NEXT: s_endpgm ; ; GFX11-LABEL: test_kill_divergent_loop: ; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX11-NEXT: s_mov_b64 s[0:1], exec -; GFX11-NEXT: s_mov_b64 s[2:3], exec -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX11-NEXT: s_cbranch_execz .LBB10_3 +; GFX11-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX11-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX11-NEXT: s_cmov_b64 exec, vcc +; GFX11-NEXT: s_cbranch_scc0 .LBB10_4 ; GFX11-NEXT: .LBB10_1: ; %bb ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: ;;#ASMSTART @@ -1092,7 +1099,7 @@ define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) #0 { ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v7 ; GFX11-NEXT: s_and_not1_b64 s[0:1], s[0:1], vcc -; GFX11-NEXT: s_cbranch_scc0 .LBB10_4 +; GFX11-NEXT: s_cbranch_scc0 .LBB10_5 ; GFX11-NEXT: ; %bb.2: ; %bb ; GFX11-NEXT: ; in Loop: Header=BB10_1 Depth=1 ; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc @@ -1100,15 +1107,16 @@ define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) #0 { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX11-NEXT: s_cbranch_vccnz .LBB10_1 -; GFX11-NEXT: .LBB10_3: ; %Flow1 +; GFX11-NEXT: ; %bb.3: ; %Flow ; GFX11-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11-NEXT: .LBB10_4: ; %exit ; GFX11-NEXT: v_mov_b32_e32 v0, 8 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm -; GFX11-NEXT: .LBB10_4: +; GFX11-NEXT: .LBB10_5: ; GFX11-NEXT: s_mov_b64 exec, 0 ; GFX11-NEXT: exp mrt0 off, off, off, off done ; GFX11-NEXT: s_endpgm @@ -1402,22 +1410,24 @@ define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2, ; SI-NEXT: s_mov_b64 s[0:1], exec ; SI-NEXT: s_wqm_b64 exec, exec ; SI-NEXT: v_cmp_nle_f32_e32 vcc, 0, v1 -; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc -; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; SI-NEXT: s_cbranch_execz .LBB13_3 +; SI-NEXT: s_xor_b64 s[2:3], vcc, exec +; SI-NEXT: s_and_b64 s[4:5], vcc, -1 +; SI-NEXT: s_cmov_b64 exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB13_3 ; SI-NEXT: ; %bb.1: ; %bb3 ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc ; SI-NEXT: s_cbranch_scc0 .LBB13_6 ; SI-NEXT: ; %bb.2: ; %bb3 ; SI-NEXT: s_andn2_b64 exec, exec, vcc -; SI-NEXT: .LBB13_3: ; %bb4 ; SI-NEXT: s_or_b64 exec, exec, s[2:3] +; SI-NEXT: .LBB13_3: ; %bb4 ; SI-NEXT: image_sample_c v0, v[2:3], s[0:7], s[0:3] dmask:0x10 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 -; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc -; SI-NEXT: s_cbranch_execz .LBB13_5 +; SI-NEXT: s_and_b64 s[0:1], vcc, -1 +; SI-NEXT: s_cmov_b64 exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB13_5 ; SI-NEXT: ; %bb.4: ; %bb8 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -1436,22 +1446,24 @@ define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2, ; GFX10-WAVE64-NEXT: s_mov_b64 s[0:1], exec ; GFX10-WAVE64-NEXT: s_wqm_b64 exec, exec ; GFX10-WAVE64-NEXT: v_cmp_nle_f32_e32 vcc, 0, v1 -; GFX10-WAVE64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10-WAVE64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX10-WAVE64-NEXT: s_cbranch_execz .LBB13_3 +; GFX10-WAVE64-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX10-WAVE64-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX10-WAVE64-NEXT: s_cmov_b64 exec, vcc +; GFX10-WAVE64-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX10-WAVE64-NEXT: ; %bb.1: ; %bb3 ; GFX10-WAVE64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 ; GFX10-WAVE64-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc ; GFX10-WAVE64-NEXT: s_cbranch_scc0 .LBB13_6 ; GFX10-WAVE64-NEXT: ; %bb.2: ; %bb3 ; GFX10-WAVE64-NEXT: s_andn2_b64 exec, exec, vcc -; GFX10-WAVE64-NEXT: .LBB13_3: ; %bb4 ; GFX10-WAVE64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10-WAVE64-NEXT: .LBB13_3: ; %bb4 ; GFX10-WAVE64-NEXT: image_sample_c v0, v[2:3], s[0:7], s[0:3] dmask:0x10 dim:SQ_RSRC_IMG_1D ; GFX10-WAVE64-NEXT: s_waitcnt vmcnt(0) ; GFX10-WAVE64-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 -; GFX10-WAVE64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX10-WAVE64-NEXT: s_cbranch_execz .LBB13_5 +; GFX10-WAVE64-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX10-WAVE64-NEXT: s_cmov_b64 exec, vcc +; GFX10-WAVE64-NEXT: s_cbranch_scc0 .LBB13_5 ; GFX10-WAVE64-NEXT: ; %bb.4: ; %bb8 ; GFX10-WAVE64-NEXT: v_mov_b32_e32 v0, 9 ; GFX10-WAVE64-NEXT: global_store_dword v[0:1], v0, off @@ -1468,22 +1480,24 @@ define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2, ; GFX10-WAVE32-NEXT: s_mov_b32 s0, exec_lo ; GFX10-WAVE32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-WAVE32-NEXT: v_cmp_nle_f32_e32 vcc_lo, 0, v1 -; GFX10-WAVE32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10-WAVE32-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX10-WAVE32-NEXT: s_cbranch_execz .LBB13_3 +; GFX10-WAVE32-NEXT: s_xor_b32 s1, vcc_lo, exec_lo +; GFX10-WAVE32-NEXT: s_and_b32 s2, vcc_lo, -1 +; GFX10-WAVE32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-WAVE32-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX10-WAVE32-NEXT: ; %bb.1: ; %bb3 ; GFX10-WAVE32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v0 ; GFX10-WAVE32-NEXT: s_andn2_b32 s0, s0, vcc_lo ; GFX10-WAVE32-NEXT: s_cbranch_scc0 .LBB13_6 ; GFX10-WAVE32-NEXT: ; %bb.2: ; %bb3 ; GFX10-WAVE32-NEXT: s_andn2_b32 exec_lo, exec_lo, vcc_lo -; GFX10-WAVE32-NEXT: .LBB13_3: ; %bb4 ; GFX10-WAVE32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-WAVE32-NEXT: .LBB13_3: ; %bb4 ; GFX10-WAVE32-NEXT: image_sample_c v0, v[2:3], s[0:7], s[0:3] dmask:0x10 dim:SQ_RSRC_IMG_1D ; GFX10-WAVE32-NEXT: s_waitcnt vmcnt(0) ; GFX10-WAVE32-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0, v0 -; GFX10-WAVE32-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX10-WAVE32-NEXT: s_cbranch_execz .LBB13_5 +; GFX10-WAVE32-NEXT: s_and_b32 s0, vcc_lo, -1 +; GFX10-WAVE32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-WAVE32-NEXT: s_cbranch_scc0 .LBB13_5 ; GFX10-WAVE32-NEXT: ; %bb.4: ; %bb8 ; GFX10-WAVE32-NEXT: v_mov_b32_e32 v0, 9 ; GFX10-WAVE32-NEXT: global_store_dword v[0:1], v0, off @@ -1499,25 +1513,26 @@ define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2, ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_mov_b64 s[0:1], exec ; GFX11-NEXT: s_wqm_b64 exec, exec -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_mov_b64 s[2:3], exec -; GFX11-NEXT: v_cmpx_nle_f32_e32 0, v1 -; GFX11-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX11-NEXT: s_cbranch_execz .LBB13_3 +; GFX11-NEXT: v_cmp_nle_f32_e32 vcc, 0, v1 +; GFX11-NEXT: s_xor_b64 s[2:3], vcc, exec +; GFX11-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX11-NEXT: s_cmov_b64 exec, vcc +; GFX11-NEXT: s_cbranch_scc0 .LBB13_3 ; GFX11-NEXT: ; %bb.1: ; %bb3 ; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 ; GFX11-NEXT: s_and_not1_b64 s[0:1], s[0:1], vcc ; GFX11-NEXT: s_cbranch_scc0 .LBB13_6 ; GFX11-NEXT: ; %bb.2: ; %bb3 ; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc -; GFX11-NEXT: .LBB13_3: ; %bb4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11-NEXT: .LBB13_3: ; %bb4 ; GFX11-NEXT: image_sample_c v0, v[2:3], s[0:7], s[0:3] dmask:0x10 dim:SQ_RSRC_IMG_1D -; GFX11-NEXT: s_mov_b64 s[0:1], exec ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmpx_neq_f32_e32 0, v0 -; GFX11-NEXT: s_cbranch_execz .LBB13_5 +; GFX11-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 +; GFX11-NEXT: s_and_b64 s[0:1], vcc, -1 +; GFX11-NEXT: s_cmov_b64 exec, vcc +; GFX11-NEXT: s_cbranch_scc0 .LBB13_5 ; GFX11-NEXT: ; %bb.4: ; %bb8 ; GFX11-NEXT: v_mov_b32_e32 v0, 9 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc @@ -1554,31 +1569,35 @@ bb9: ; preds = %bb4 define amdgpu_ps void @cbranch_kill(i32 inreg %0, float %val0, float %val1) { ; SI-LABEL: cbranch_kill: ; SI: ; %bb.0: ; %.entry -; SI-NEXT: s_mov_b64 s[0:1], exec +; SI-NEXT: s_mov_b64 s[2:3], exec ; SI-NEXT: v_mov_b32_e32 v4, 0 ; SI-NEXT: v_mov_b32_e32 v2, v1 ; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: image_sample_l v1, v[1:4], s[0:7], s[0:3] dmask:0x1 da ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ge_f32_e32 vcc, 0, v1 -; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc -; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; SI-NEXT: s_cbranch_execz .LBB14_3 +; SI-NEXT: s_xor_b64 s[0:1], vcc, exec +; SI-NEXT: s_and_b64 s[4:5], vcc, -1 +; SI-NEXT: s_cmov_b64 exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB14_3 ; SI-NEXT: ; %bb.1: ; %kill -; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], exec ; SI-NEXT: s_cbranch_scc0 .LBB14_6 ; SI-NEXT: ; %bb.2: ; %kill ; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: .LBB14_3: ; %Flow -; SI-NEXT: s_or_saveexec_b64 s[0:1], s[2:3] +; SI-NEXT: s_xor_b64 s[2:3], s[0:1], exec +; SI-NEXT: s_and_b64 s[4:5], s[0:1], -1 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: s_xor_b64 exec, exec, s[0:1] +; SI-NEXT: s_cmov_b64 exec, s[0:1] +; SI-NEXT: s_cbranch_scc0 .LBB14_5 ; SI-NEXT: ; %bb.4: ; %live ; SI-NEXT: v_mul_f32_e32 v2, v0, v1 -; SI-NEXT: ; %bb.5: ; %export -; SI-NEXT: s_or_b64 exec, exec, s[0:1] +; SI-NEXT: s_or_b64 exec, exec, s[2:3] +; SI-NEXT: .LBB14_5: ; %export ; SI-NEXT: exp mrt0 v2, v2, v2, v2 done vm ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB14_6: @@ -1589,28 +1608,32 @@ define amdgpu_ps void @cbranch_kill(i32 inreg %0, float %val0, float %val1) { ; GFX10-WAVE64-LABEL: cbranch_kill: ; GFX10-WAVE64: ; %bb.0: ; %.entry ; GFX10-WAVE64-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-WAVE64-NEXT: s_mov_b64 s[0:1], exec +; GFX10-WAVE64-NEXT: s_mov_b64 s[2:3], exec ; GFX10-WAVE64-NEXT: image_sample_l v1, [v1, v1, v1, v2], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10-WAVE64-NEXT: s_waitcnt vmcnt(0) ; GFX10-WAVE64-NEXT: v_cmp_ge_f32_e32 vcc, 0, v1 -; GFX10-WAVE64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10-WAVE64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX10-WAVE64-NEXT: s_cbranch_execz .LBB14_3 +; GFX10-WAVE64-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX10-WAVE64-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX10-WAVE64-NEXT: s_cmov_b64 exec, vcc +; GFX10-WAVE64-NEXT: s_cbranch_scc0 .LBB14_3 ; GFX10-WAVE64-NEXT: ; %bb.1: ; %kill -; GFX10-WAVE64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec -; GFX10-WAVE64-NEXT: ; implicit-def: $vgpr0 -; GFX10-WAVE64-NEXT: ; implicit-def: $vgpr1 +; GFX10-WAVE64-NEXT: s_andn2_b64 s[2:3], s[2:3], exec ; GFX10-WAVE64-NEXT: s_cbranch_scc0 .LBB14_6 ; GFX10-WAVE64-NEXT: ; %bb.2: ; %kill ; GFX10-WAVE64-NEXT: s_mov_b64 exec, 0 +; GFX10-WAVE64-NEXT: ; implicit-def: $vgpr0 +; GFX10-WAVE64-NEXT: ; implicit-def: $vgpr1 +; GFX10-WAVE64-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX10-WAVE64-NEXT: .LBB14_3: ; %Flow -; GFX10-WAVE64-NEXT: s_or_saveexec_b64 s[0:1], s[2:3] +; GFX10-WAVE64-NEXT: s_xor_b64 s[2:3], s[0:1], exec +; GFX10-WAVE64-NEXT: s_and_b64 s[4:5], s[0:1], -1 ; GFX10-WAVE64-NEXT: ; implicit-def: $vgpr2 -; GFX10-WAVE64-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX10-WAVE64-NEXT: s_cmov_b64 exec, s[0:1] +; GFX10-WAVE64-NEXT: s_cbranch_scc0 .LBB14_5 ; GFX10-WAVE64-NEXT: ; %bb.4: ; %live ; GFX10-WAVE64-NEXT: v_mul_f32_e32 v2, v0, v1 -; GFX10-WAVE64-NEXT: ; %bb.5: ; %export -; GFX10-WAVE64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX10-WAVE64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10-WAVE64-NEXT: .LBB14_5: ; %export ; GFX10-WAVE64-NEXT: exp mrt0 v2, v2, v2, v2 done vm ; GFX10-WAVE64-NEXT: s_endpgm ; GFX10-WAVE64-NEXT: .LBB14_6: @@ -1621,28 +1644,32 @@ define amdgpu_ps void @cbranch_kill(i32 inreg %0, float %val0, float %val1) { ; GFX10-WAVE32-LABEL: cbranch_kill: ; GFX10-WAVE32: ; %bb.0: ; %.entry ; GFX10-WAVE32-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-WAVE32-NEXT: s_mov_b32 s0, exec_lo +; GFX10-WAVE32-NEXT: s_mov_b32 s1, exec_lo ; GFX10-WAVE32-NEXT: image_sample_l v1, [v1, v1, v1, v2], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10-WAVE32-NEXT: s_waitcnt vmcnt(0) ; GFX10-WAVE32-NEXT: v_cmp_ge_f32_e32 vcc_lo, 0, v1 -; GFX10-WAVE32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10-WAVE32-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX10-WAVE32-NEXT: s_cbranch_execz .LBB14_3 +; GFX10-WAVE32-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX10-WAVE32-NEXT: s_and_b32 s2, vcc_lo, -1 +; GFX10-WAVE32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-WAVE32-NEXT: s_cbranch_scc0 .LBB14_3 ; GFX10-WAVE32-NEXT: ; %bb.1: ; %kill -; GFX10-WAVE32-NEXT: s_andn2_b32 s0, s0, exec_lo -; GFX10-WAVE32-NEXT: ; implicit-def: $vgpr0 -; GFX10-WAVE32-NEXT: ; implicit-def: $vgpr1 +; GFX10-WAVE32-NEXT: s_andn2_b32 s1, s1, exec_lo ; GFX10-WAVE32-NEXT: s_cbranch_scc0 .LBB14_6 ; GFX10-WAVE32-NEXT: ; %bb.2: ; %kill ; GFX10-WAVE32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-WAVE32-NEXT: ; implicit-def: $vgpr0 +; GFX10-WAVE32-NEXT: ; implicit-def: $vgpr1 +; GFX10-WAVE32-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX10-WAVE32-NEXT: .LBB14_3: ; %Flow -; GFX10-WAVE32-NEXT: s_or_saveexec_b32 s0, s1 +; GFX10-WAVE32-NEXT: s_xor_b32 s1, s0, exec_lo +; GFX10-WAVE32-NEXT: s_and_b32 s2, s0, -1 ; GFX10-WAVE32-NEXT: ; implicit-def: $vgpr2 -; GFX10-WAVE32-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX10-WAVE32-NEXT: s_cmov_b32 exec_lo, s0 +; GFX10-WAVE32-NEXT: s_cbranch_scc0 .LBB14_5 ; GFX10-WAVE32-NEXT: ; %bb.4: ; %live ; GFX10-WAVE32-NEXT: v_mul_f32_e32 v2, v0, v1 -; GFX10-WAVE32-NEXT: ; %bb.5: ; %export -; GFX10-WAVE32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-WAVE32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-WAVE32-NEXT: .LBB14_5: ; %export ; GFX10-WAVE32-NEXT: exp mrt0 v2, v2, v2, v2 done vm ; GFX10-WAVE32-NEXT: s_endpgm ; GFX10-WAVE32-NEXT: .LBB14_6: @@ -1653,29 +1680,34 @@ define amdgpu_ps void @cbranch_kill(i32 inreg %0, float %val0, float %val1) { ; GFX11-LABEL: cbranch_kill: ; GFX11: ; %bb.0: ; %.entry ; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_mov_b64 s[0:1], exec -; GFX11-NEXT: image_sample_l v1, [v1, v1, v1, v2], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX11-NEXT: s_mov_b64 s[2:3], exec +; GFX11-NEXT: image_sample_l v1, [v1, v1, v1, v2], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmpx_ge_f32_e32 0, v1 -; GFX11-NEXT: s_xor_b64 s[2:3], exec, s[2:3] -; GFX11-NEXT: s_cbranch_execz .LBB14_3 +; GFX11-NEXT: v_cmp_ge_f32_e32 vcc, 0, v1 +; GFX11-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX11-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX11-NEXT: s_cmov_b64 exec, vcc +; GFX11-NEXT: s_cbranch_scc0 .LBB14_3 ; GFX11-NEXT: ; %bb.1: ; %kill -; GFX11-NEXT: s_and_not1_b64 s[0:1], s[0:1], exec -; GFX11-NEXT: ; implicit-def: $vgpr0 -; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: s_and_not1_b64 s[2:3], s[2:3], exec ; GFX11-NEXT: s_cbranch_scc0 .LBB14_6 ; GFX11-NEXT: ; %bb.2: ; %kill ; GFX11-NEXT: s_mov_b64 exec, 0 +; GFX11-NEXT: ; implicit-def: $vgpr0 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: .LBB14_3: ; %Flow -; GFX11-NEXT: s_or_saveexec_b64 s[0:1], s[2:3] -; GFX11-NEXT: ; implicit-def: $vgpr2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX11-NEXT: s_xor_b64 s[2:3], s[0:1], exec +; GFX11-NEXT: s_and_b64 s[4:5], s[0:1], -1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: s_cmov_b64 exec, s[0:1] +; GFX11-NEXT: s_cbranch_scc0 .LBB14_5 ; GFX11-NEXT: ; %bb.4: ; %live ; GFX11-NEXT: v_mul_f32_e32 v2, v0, v1 -; GFX11-NEXT: ; %bb.5: ; %export -; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11-NEXT: .LBB14_5: ; %export ; GFX11-NEXT: exp mrt0 v2, v2, v2, v2 done ; GFX11-NEXT: s_endpgm ; GFX11-NEXT: .LBB14_6: @@ -1714,19 +1746,21 @@ define amdgpu_ps void @complex_loop(i32 inreg %cmpa, i32 %cmpb, i32 %cmpc) { ; SI-NEXT: s_branch .LBB15_3 ; SI-NEXT: .LBB15_2: ; %latch ; SI-NEXT: ; in Loop: Header=BB15_3 Depth=1 -; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_add_i32 s6, s6, 1 ; SI-NEXT: v_cmp_ge_i32_e32 vcc, s6, v1 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; SI-NEXT: s_and_b64 s[8:9], s[4:5], -1 ; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execz .LBB15_6 +; SI-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; SI-NEXT: s_cbranch_scc0 .LBB15_6 ; SI-NEXT: .LBB15_3: ; %hdr ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_cmp_gt_u32_e32 vcc, s6, v0 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB15_2 +; SI-NEXT: s_xor_b64 s[4:5], vcc, exec +; SI-NEXT: s_and_b64 s[8:9], vcc, -1 +; SI-NEXT: s_cmov_b64 exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB15_2 ; SI-NEXT: ; %bb.4: ; %kill ; SI-NEXT: ; in Loop: Header=BB15_3 Depth=1 ; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], exec @@ -1734,9 +1768,9 @@ define amdgpu_ps void @complex_loop(i32 inreg %cmpa, i32 %cmpb, i32 %cmpc) { ; SI-NEXT: ; %bb.5: ; %kill ; SI-NEXT: ; in Loop: Header=BB15_3 Depth=1 ; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_branch .LBB15_2 -; SI-NEXT: .LBB15_6: ; %Flow -; SI-NEXT: s_or_b64 exec, exec, s[0:1] +; SI-NEXT: .LBB15_6: ; %._crit_edge ; SI-NEXT: exp mrt0 v2, v2, v0, v0 done vm ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB15_7: @@ -1759,19 +1793,21 @@ define amdgpu_ps void @complex_loop(i32 inreg %cmpa, i32 %cmpb, i32 %cmpc) { ; GFX10-WAVE64-NEXT: s_branch .LBB15_3 ; GFX10-WAVE64-NEXT: .LBB15_2: ; %latch ; GFX10-WAVE64-NEXT: ; in Loop: Header=BB15_3 Depth=1 -; GFX10-WAVE64-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX10-WAVE64-NEXT: s_add_i32 s6, s6, 1 ; GFX10-WAVE64-NEXT: v_cmp_ge_i32_e32 vcc, s6, v1 ; GFX10-WAVE64-NEXT: v_mov_b32_e32 v2, s6 ; GFX10-WAVE64-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-WAVE64-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX10-WAVE64-NEXT: s_cbranch_execz .LBB15_6 +; GFX10-WAVE64-NEXT: s_andn2_b64 s[4:5], exec, s[0:1] +; GFX10-WAVE64-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GFX10-WAVE64-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX10-WAVE64-NEXT: s_cbranch_scc0 .LBB15_6 ; GFX10-WAVE64-NEXT: .LBB15_3: ; %hdr ; GFX10-WAVE64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-WAVE64-NEXT: v_cmp_gt_u32_e32 vcc, s6, v0 -; GFX10-WAVE64-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX10-WAVE64-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX10-WAVE64-NEXT: s_cbranch_execz .LBB15_2 +; GFX10-WAVE64-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX10-WAVE64-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX10-WAVE64-NEXT: s_cmov_b64 exec, vcc +; GFX10-WAVE64-NEXT: s_cbranch_scc0 .LBB15_2 ; GFX10-WAVE64-NEXT: ; %bb.4: ; %kill ; GFX10-WAVE64-NEXT: ; in Loop: Header=BB15_3 Depth=1 ; GFX10-WAVE64-NEXT: s_andn2_b64 s[2:3], s[2:3], exec @@ -1779,9 +1815,9 @@ define amdgpu_ps void @complex_loop(i32 inreg %cmpa, i32 %cmpb, i32 %cmpc) { ; GFX10-WAVE64-NEXT: ; %bb.5: ; %kill ; GFX10-WAVE64-NEXT: ; in Loop: Header=BB15_3 Depth=1 ; GFX10-WAVE64-NEXT: s_mov_b64 exec, 0 +; GFX10-WAVE64-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX10-WAVE64-NEXT: s_branch .LBB15_2 -; GFX10-WAVE64-NEXT: .LBB15_6: ; %Flow -; GFX10-WAVE64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX10-WAVE64-NEXT: .LBB15_6: ; %._crit_edge ; GFX10-WAVE64-NEXT: exp mrt0 v2, v2, v0, v0 done vm ; GFX10-WAVE64-NEXT: s_endpgm ; GFX10-WAVE64-NEXT: .LBB15_7: @@ -1804,19 +1840,21 @@ define amdgpu_ps void @complex_loop(i32 inreg %cmpa, i32 %cmpb, i32 %cmpc) { ; GFX10-WAVE32-NEXT: s_branch .LBB15_3 ; GFX10-WAVE32-NEXT: .LBB15_2: ; %latch ; GFX10-WAVE32-NEXT: ; in Loop: Header=BB15_3 Depth=1 -; GFX10-WAVE32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX10-WAVE32-NEXT: s_add_i32 s2, s2, 1 ; GFX10-WAVE32-NEXT: v_cmp_ge_i32_e32 vcc_lo, s2, v1 ; GFX10-WAVE32-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WAVE32-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX10-WAVE32-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX10-WAVE32-NEXT: s_cbranch_execz .LBB15_6 +; GFX10-WAVE32-NEXT: s_andn2_b32 s3, exec_lo, s0 +; GFX10-WAVE32-NEXT: s_and_b32 s4, s3, -1 +; GFX10-WAVE32-NEXT: s_cselect_b32 exec_lo, s3, s0 +; GFX10-WAVE32-NEXT: s_cbranch_scc0 .LBB15_6 ; GFX10-WAVE32-NEXT: .LBB15_3: ; %hdr ; GFX10-WAVE32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-WAVE32-NEXT: v_cmp_gt_u32_e32 vcc_lo, s2, v0 -; GFX10-WAVE32-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX10-WAVE32-NEXT: s_xor_b32 s3, exec_lo, s3 -; GFX10-WAVE32-NEXT: s_cbranch_execz .LBB15_2 +; GFX10-WAVE32-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GFX10-WAVE32-NEXT: s_and_b32 s4, vcc_lo, -1 +; GFX10-WAVE32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-WAVE32-NEXT: s_cbranch_scc0 .LBB15_2 ; GFX10-WAVE32-NEXT: ; %bb.4: ; %kill ; GFX10-WAVE32-NEXT: ; in Loop: Header=BB15_3 Depth=1 ; GFX10-WAVE32-NEXT: s_andn2_b32 s1, s1, exec_lo @@ -1824,9 +1862,9 @@ define amdgpu_ps void @complex_loop(i32 inreg %cmpa, i32 %cmpb, i32 %cmpc) { ; GFX10-WAVE32-NEXT: ; %bb.5: ; %kill ; GFX10-WAVE32-NEXT: ; in Loop: Header=BB15_3 Depth=1 ; GFX10-WAVE32-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-WAVE32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX10-WAVE32-NEXT: s_branch .LBB15_2 -; GFX10-WAVE32-NEXT: .LBB15_6: ; %Flow -; GFX10-WAVE32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-WAVE32-NEXT: .LBB15_6: ; %._crit_edge ; GFX10-WAVE32-NEXT: exp mrt0 v2, v2, v0, v0 done vm ; GFX10-WAVE32-NEXT: s_endpgm ; GFX10-WAVE32-NEXT: .LBB15_7: @@ -1847,22 +1885,26 @@ define amdgpu_ps void @complex_loop(i32 inreg %cmpa, i32 %cmpb, i32 %cmpc) { ; GFX11-NEXT: s_mov_b32 s6, 0 ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_branch .LBB15_3 +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB15_2: ; %latch ; GFX11-NEXT: ; in Loop: Header=BB15_3 Depth=1 -; GFX11-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX11-NEXT: s_add_i32 s6, s6, 1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_cmp_ge_i32_e32 vcc, s6, v1 ; GFX11-NEXT: v_mov_b32_e32 v2, s6 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] -; GFX11-NEXT: s_cbranch_execz .LBB15_6 +; GFX11-NEXT: s_and_not1_b64 s[4:5], exec, s[0:1] +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GFX11-NEXT: s_cselect_b64 exec, s[4:5], s[0:1] +; GFX11-NEXT: s_cbranch_scc0 .LBB15_6 ; GFX11-NEXT: .LBB15_3: ; %hdr ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_mov_b64 s[4:5], exec -; GFX11-NEXT: v_cmpx_gt_u32_e64 s6, v0 -; GFX11-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX11-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc, s6, v0 +; GFX11-NEXT: s_xor_b64 s[4:5], vcc, exec +; GFX11-NEXT: s_and_b64 s[8:9], vcc, -1 +; GFX11-NEXT: s_cmov_b64 exec, vcc +; GFX11-NEXT: s_cbranch_scc0 .LBB15_2 ; GFX11-NEXT: ; %bb.4: ; %kill ; GFX11-NEXT: ; in Loop: Header=BB15_3 Depth=1 ; GFX11-NEXT: s_and_not1_b64 s[2:3], s[2:3], exec @@ -1870,9 +1912,10 @@ define amdgpu_ps void @complex_loop(i32 inreg %cmpa, i32 %cmpb, i32 %cmpc) { ; GFX11-NEXT: ; %bb.5: ; %kill ; GFX11-NEXT: ; in Loop: Header=BB15_3 Depth=1 ; GFX11-NEXT: s_mov_b64 exec, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX11-NEXT: s_branch .LBB15_2 -; GFX11-NEXT: .LBB15_6: ; %Flow -; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11-NEXT: .LBB15_6: ; %._crit_edge ; GFX11-NEXT: exp mrt0 v2, v2, v0, v0 done ; GFX11-NEXT: s_endpgm ; GFX11-NEXT: .LBB15_7: @@ -1916,36 +1959,42 @@ define void @skip_mode_switch(i32 %arg) { ; WAVE64: ; %bb.0: ; %entry ; WAVE64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; WAVE64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; WAVE64-NEXT: s_and_saveexec_b64 s[4:5], vcc -; WAVE64-NEXT: s_cbranch_execz .LBB16_2 +; WAVE64-NEXT: s_mov_b64 s[4:5], exec +; WAVE64-NEXT: s_and_b64 s[6:7], vcc, -1 +; WAVE64-NEXT: s_cmov_b64 exec, vcc +; WAVE64-NEXT: s_cbranch_scc0 .LBB16_2 ; WAVE64-NEXT: ; %bb.1: ; %bb.0 ; WAVE64-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 2), 3 -; WAVE64-NEXT: .LBB16_2: ; %bb.1 ; WAVE64-NEXT: s_or_b64 exec, exec, s[4:5] +; WAVE64-NEXT: .LBB16_2: ; %bb.1 ; WAVE64-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-WAVE32-LABEL: skip_mode_switch: ; GFX10-WAVE32: ; %bb.0: ; %entry ; GFX10-WAVE32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-WAVE32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-WAVE32-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX10-WAVE32-NEXT: s_cbranch_execz .LBB16_2 +; GFX10-WAVE32-NEXT: s_mov_b32 s4, exec_lo +; GFX10-WAVE32-NEXT: s_and_b32 s5, vcc_lo, -1 +; GFX10-WAVE32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-WAVE32-NEXT: s_cbranch_scc0 .LBB16_2 ; GFX10-WAVE32-NEXT: ; %bb.1: ; %bb.0 ; GFX10-WAVE32-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 2), 3 -; GFX10-WAVE32-NEXT: .LBB16_2: ; %bb.1 ; GFX10-WAVE32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-WAVE32-NEXT: .LBB16_2: ; %bb.1 ; GFX10-WAVE32-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: skip_mode_switch: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX11-NEXT: s_mov_b64 s[0:1], exec -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX11-NEXT: s_cmov_b64 exec, vcc +; GFX11-NEXT: s_cbranch_scc0 .LBB16_2 ; GFX11-NEXT: ; %bb.1: ; %bb.0 ; GFX11-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 2), 3 -; GFX11-NEXT: .LBB16_2: ; %bb.1 ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11-NEXT: .LBB16_2: ; %bb.1 ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp = icmp eq i32 %arg, 0 diff --git a/llvm/test/CodeGen/AMDGPU/spill-cfg-position.ll b/llvm/test/CodeGen/AMDGPU/spill-cfg-position.ll index c3b6d8d761f26..99a945202de3b 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-cfg-position.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-cfg-position.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -stress-regalloc=6 < %s | FileCheck %s ; Inline spiller can decide to move a spill as early as possible in the basic block. diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll index bea2e6d4b45a3..54794cde87f3e 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -10077,11 +10077,11 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0 ; GFX6-NEXT: s_addc_u32 s41, s41, 0 +; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, -1, v0 ; GFX6-NEXT: v_mov_b32_e32 v6, 0 -; GFX6-NEXT: s_mov_b64 s[4:5], exec -; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b64 s[4:5], exec ; GFX6-NEXT: s_mov_b64 exec, 15 ; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) lgkmcnt(0) @@ -10273,6 +10273,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[0:1] ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: s_mov_b64 s[0:1], exec +; GFX6-NEXT: s_and_b64 s[36:37], vcc, -1 ; GFX6-NEXT: ;;#ASMSTART ; GFX6-NEXT: ; def s[8:15] ; GFX6-NEXT: ;;#ASMEND @@ -10283,19 +10285,18 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: ; def s[24:31] ; GFX6-NEXT: ;;#ASMEND ; GFX6-NEXT: ;;#ASMSTART -; GFX6-NEXT: ; def s[0:3] +; GFX6-NEXT: ; def s[4:7] ; GFX6-NEXT: ;;#ASMEND ; GFX6-NEXT: ;;#ASMSTART -; GFX6-NEXT: ; def s[4:5] +; GFX6-NEXT: ; def s[34:35] ; GFX6-NEXT: ;;#ASMEND ; GFX6-NEXT: ;;#ASMSTART ; GFX6-NEXT: ; def s33 ; GFX6-NEXT: ;;#ASMEND -; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX6-NEXT: s_mov_b64 vcc, s[6:7] -; GFX6-NEXT: s_cbranch_execz .LBB1_2 +; GFX6-NEXT: s_cmov_b64 exec, vcc +; GFX6-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX6-NEXT: ; %bb.1: ; %bb0 -; GFX6-NEXT: s_mov_b64 s[6:7], exec +; GFX6-NEXT: s_mov_b64 s[2:3], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10307,18 +10308,18 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_writelane_b32 v4, s13, 5 ; GFX6-NEXT: v_writelane_b32 v4, s14, 6 ; GFX6-NEXT: v_writelane_b32 v4, s15, 7 -; GFX6-NEXT: s_mov_b32 s34, 0x85000 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill +; GFX6-NEXT: s_mov_b32 s36, 0x85000 +; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s36 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: s_mov_b64 s[6:7], exec +; GFX6-NEXT: s_mov_b64 exec, s[2:3] +; GFX6-NEXT: s_mov_b64 s[2:3], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_mov_b32 s34, 0x84800 +; GFX6-NEXT: s_mov_b32 s36, 0x84800 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s36 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_readlane_b32 s8, v4, 0 ; GFX6-NEXT: v_readlane_b32 s9, v4, 1 @@ -10330,8 +10331,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_readlane_b32 s15, v4, 7 ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: s_mov_b64 s[6:7], exec +; GFX6-NEXT: s_mov_b64 exec, s[2:3] +; GFX6-NEXT: s_mov_b64 s[2:3], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10343,18 +10344,18 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_writelane_b32 v4, s21, 5 ; GFX6-NEXT: v_writelane_b32 v4, s22, 6 ; GFX6-NEXT: v_writelane_b32 v4, s23, 7 -; GFX6-NEXT: s_mov_b32 s34, 0x85800 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill +; GFX6-NEXT: s_mov_b32 s36, 0x85800 +; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s36 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: s_mov_b64 s[6:7], exec +; GFX6-NEXT: s_mov_b64 exec, s[2:3] +; GFX6-NEXT: s_mov_b64 s[2:3], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_mov_b32 s34, 0x85000 +; GFX6-NEXT: s_mov_b32 s36, 0x85000 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s36 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_readlane_b32 s16, v4, 0 ; GFX6-NEXT: v_readlane_b32 s17, v4, 1 @@ -10366,8 +10367,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_readlane_b32 s23, v4, 7 ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: s_mov_b64 s[6:7], exec +; GFX6-NEXT: s_mov_b64 exec, s[2:3] +; GFX6-NEXT: s_mov_b64 s[2:3], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10379,18 +10380,18 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_writelane_b32 v4, s29, 5 ; GFX6-NEXT: v_writelane_b32 v4, s30, 6 ; GFX6-NEXT: v_writelane_b32 v4, s31, 7 -; GFX6-NEXT: s_mov_b32 s34, 0x86000 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill +; GFX6-NEXT: s_mov_b32 s36, 0x86000 +; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s36 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: s_mov_b64 s[6:7], exec +; GFX6-NEXT: s_mov_b64 exec, s[2:3] +; GFX6-NEXT: s_mov_b64 s[2:3], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_mov_b32 s34, 0x85800 +; GFX6-NEXT: s_mov_b32 s36, 0x85800 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s36 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_readlane_b32 s24, v4, 0 ; GFX6-NEXT: v_readlane_b32 s25, v4, 1 @@ -10402,39 +10403,28 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_readlane_b32 s31, v4, 7 ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: s_mov_b64 exec, 15 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_writelane_b32 v4, s0, 0 -; GFX6-NEXT: v_writelane_b32 v4, s1, 1 -; GFX6-NEXT: v_writelane_b32 v4, s2, 2 -; GFX6-NEXT: v_writelane_b32 v4, s3, 3 -; GFX6-NEXT: s_mov_b32 s34, 0x86800 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[6:7] +; GFX6-NEXT: s_mov_b64 exec, s[2:3] +; GFX6-NEXT: s_mov_b64 vcc, s[0:1] ; GFX6-NEXT: s_mov_b64 s[0:1], exec -; GFX6-NEXT: s_mov_b64 exec, 3 +; GFX6-NEXT: s_mov_b64 exec, 15 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_writelane_b32 v4, s4, 0 ; GFX6-NEXT: v_writelane_b32 v4, s5, 1 -; GFX6-NEXT: s_mov_b32 s2, 0x86c00 +; GFX6-NEXT: v_writelane_b32 v4, s6, 2 +; GFX6-NEXT: v_writelane_b32 v4, s7, 3 +; GFX6-NEXT: s_mov_b32 s2, 0x86800 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[0:1] -; GFX6-NEXT: s_mov_b64 s[34:35], exec +; GFX6-NEXT: s_mov_b64 s[36:37], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_mov_b32 s36, 0x86000 +; GFX6-NEXT: s_mov_b32 s38, 0x86000 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s36 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_readlane_b32 s0, v4, 0 ; GFX6-NEXT: v_readlane_b32 s1, v4, 1 @@ -10446,13 +10436,13 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_readlane_b32 s7, v4, 7 ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[34:35] -; GFX6-NEXT: s_mov_b64 s[34:35], exec +; GFX6-NEXT: s_mov_b64 exec, s[36:37] +; GFX6-NEXT: s_mov_b64 s[44:45], exec ; GFX6-NEXT: s_mov_b64 exec, 15 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_mov_b32 s44, 0x86800 +; GFX6-NEXT: v_mov_b32_e32 v7, 0x21a0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s44 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_readlane_b32 s36, v4, 0 ; GFX6-NEXT: v_readlane_b32 s37, v4, 1 @@ -10460,18 +10450,6 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_readlane_b32 s39, v4, 3 ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[34:35] -; GFX6-NEXT: s_mov_b64 s[44:45], exec -; GFX6-NEXT: s_mov_b64 exec, 3 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 -; GFX6-NEXT: v_mov_b32_e32 v7, 0x21b0 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readlane_b32 s34, v4, 0 -; GFX6-NEXT: v_readlane_b32 s35, v4, 1 -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[44:45] ; GFX6-NEXT: ;;#ASMSTART ; GFX6-NEXT: ; use s[8:15],s[16:23],s[24:31],s[0:7],s[36:39],s[34:35] @@ -10490,8 +10468,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: ;;#ASMEND ; GFX6-NEXT: ;;#ASMSTART ; GFX6-NEXT: ;;#ASMEND -; GFX6-NEXT: .LBB1_2: ; %ret ; GFX6-NEXT: s_or_b64 exec, exec, vcc +; GFX6-NEXT: .LBB1_2: ; %ret ; GFX6-NEXT: s_mov_b64 s[4:5], exec ; GFX6-NEXT: s_mov_b64 exec, 15 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 @@ -10656,6 +10634,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2050 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 16 +; GFX9-FLATSCR-NEXT: s_mov_b64 s[34:35], exec ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:224 @@ -10686,8 +10665,9 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:96 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20b0 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(2) -; GFX9-FLATSCR-NEXT: v_lshl_add_u32 v4, v7, 13, v4 ; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX9-FLATSCR-NEXT: v_lshl_add_u32 v4, v7, 13, v4 +; GFX9-FLATSCR-NEXT: s_and_b64 s[44:45], vcc, -1 ; GFX9-FLATSCR-NEXT: scratch_store_dword v4, v6, off ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill @@ -10732,8 +10712,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX9-FLATSCR-NEXT: ;;#ASMSTART ; GFX9-FLATSCR-NEXT: ; def s33 ; GFX9-FLATSCR-NEXT: ;;#ASMEND -; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[34:35], vcc -; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-FLATSCR-NEXT: s_cmov_b64 exec, vcc +; GFX9-FLATSCR-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX9-FLATSCR-NEXT: ; %bb.1: ; %bb0 ; GFX9-FLATSCR-NEXT: ;;#ASMSTART ; GFX9-FLATSCR-NEXT: ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[38:39] @@ -10772,8 +10752,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX9-FLATSCR-NEXT: ;;#ASMEND ; GFX9-FLATSCR-NEXT: ;;#ASMSTART ; GFX9-FLATSCR-NEXT: ;;#ASMEND -; GFX9-FLATSCR-NEXT: .LBB1_2: ; %ret ; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-FLATSCR-NEXT: .LBB1_2: ; %ret ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20c0 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20b0 @@ -10859,7 +10839,9 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[39:42], v5, s[38:39] offset:16 ; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] ; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-FLATSCR-NEXT: v_lshl_add_u32 v4, v0, 13, 16 +; GFX10-FLATSCR-NEXT: s_and_b32 s39, vcc_lo, -1 ; GFX10-FLATSCR-NEXT: scratch_store_dword v4, v6, off ; GFX10-FLATSCR-NEXT: ;;#ASMSTART ; GFX10-FLATSCR-NEXT: ; def s[0:7] @@ -10882,8 +10864,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX10-FLATSCR-NEXT: ;;#ASMSTART ; GFX10-FLATSCR-NEXT: ; def s38 ; GFX10-FLATSCR-NEXT: ;;#ASMEND -; GFX10-FLATSCR-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX10-FLATSCR-NEXT: s_cbranch_execz .LBB1_2 +; GFX10-FLATSCR-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-FLATSCR-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX10-FLATSCR-NEXT: ; %bb.1: ; %bb0 ; GFX10-FLATSCR-NEXT: ;;#ASMSTART ; GFX10-FLATSCR-NEXT: ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[34:35] @@ -11017,8 +10999,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX10-FLATSCR-NEXT: ;;#ASMEND ; GFX10-FLATSCR-NEXT: ;;#ASMSTART ; GFX10-FLATSCR-NEXT: ;;#ASMEND -; GFX10-FLATSCR-NEXT: .LBB1_2: ; %ret ; GFX10-FLATSCR-NEXT: s_or_b32 exec_lo, exec_lo, s33 +; GFX10-FLATSCR-NEXT: .LBB1_2: ; %ret ; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[63:66], s[36:37] offset:112 ; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[59:62], s[36:37] offset:96 diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index ed7f27b367fda..ec02a7ea31e7a 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -343,40 +343,44 @@ define i64 @v_test_srem(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_xor_b32_e32 v3, v3, v4 ; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 ; GCN-IR-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v2 -; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4 +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[4:5] +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v3 ; GCN-IR-NEXT: v_min_u32_e32 v12, v4, v5 ; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v0 -; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1 ; GCN-IR-NEXT: v_min_u32_e32 v13, v4, v5 -; GCN-IR-NEXT: v_sub_i32_e64 v4, s[6:7], v12, v13 -; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; GCN-IR-NEXT: v_subb_u32_e64 v5, s[6:7], 0, 0, s[6:7] -; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[6:7], 63, v[4:5] -; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5] -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v13 +; GCN-IR-NEXT: v_subb_u32_e64 v5, s[4:5], 0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[4:5], 63, v[4:5] +; GCN-IR-NEXT: s_or_b64 s[8:9], s[8:9], vcc +; GCN-IR-NEXT: s_xor_b64 s[10:11], s[8:9], -1 +; GCN-IR-NEXT: s_and_b64 s[4:5], s[10:11], s[4:5] +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-IR-NEXT: s_mov_b64 s[6:7], exec ; GCN-IR-NEXT: v_mov_b32_e32 v15, v14 -; GCN-IR-NEXT: v_cndmask_b32_e64 v7, v1, 0, s[4:5] -; GCN-IR-NEXT: v_cndmask_b32_e64 v6, v0, 0, s[4:5] -; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB1_6 +; GCN-IR-NEXT: v_cndmask_b32_e64 v7, v1, 0, s[8:9] +; GCN-IR-NEXT: s_and_b64 s[10:11], s[4:5], -1 +; GCN-IR-NEXT: v_cndmask_b32_e64 v6, v0, 0, s[8:9] +; GCN-IR-NEXT: s_cmov_b64 exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_scc0 .LBB1_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 ; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v4 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v4 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4 ; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: s_xor_b64 s[8:9], vcc, exec ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB1_5 +; GCN-IR-NEXT: s_and_b64 s[4:5], vcc, -1 +; GCN-IR-NEXT: s_cmov_b64 exec, vcc +; GCN-IR-NEXT: s_cbranch_scc0 .LBB1_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, -1, v2 ; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, -1, v3, vcc @@ -394,34 +398,35 @@ define i64 @v_test_srem(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5 ; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v6 -; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 ; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v16, v10 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 ; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v17, v11, vcc +; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v8 ; GCN-IR-NEXT: v_or_b32_e32 v4, v12, v4 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v8 +; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; GCN-IR-NEXT: v_or_b32_e32 v5, v13, v5 ; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12 ; GCN-IR-NEXT: v_and_b32_e32 v13, v12, v3 ; GCN-IR-NEXT: v_and_b32_e32 v12, v12, v2 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] ; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v12 ; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v13, s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v13, v7 ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: s_andn2_b64 s[4:5], exec, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v13, v7 +; GCN-IR-NEXT: s_and_b64 s[12:13], s[4:5], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v12, v6 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3 +; GCN-IR-NEXT: s_cselect_b64 exec, s[4:5], s[10:11] +; GCN-IR-NEXT: s_cbranch_scc1 .LBB1_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB1_5: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: .LBB1_5: ; %Flow4 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 ; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v5 ; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v4 -; GCN-IR-NEXT: .LBB1_6: ; %Flow5 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: .LBB1_6: ; %udiv-end ; GCN-IR-NEXT: v_mul_lo_u32 v4, v2, v7 ; GCN-IR-NEXT: v_mul_hi_u32 v5, v2, v6 ; GCN-IR-NEXT: v_mul_lo_u32 v3, v3, v6 @@ -1633,21 +1638,25 @@ define i64 @v_test_srem_k_num_i64(i64 %x) { ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: v_cndmask_b32_e64 v4, 24, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB11_6 +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-IR-NEXT: s_mov_b64 s[8:9], exec +; GCN-IR-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: s_cmov_b64 exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_scc0 .LBB11_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v2 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], 24, v2 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 +; GCN-IR-NEXT: s_xor_b64 s[6:7], vcc, exec ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB11_5 +; GCN-IR-NEXT: s_and_b64 s[4:5], vcc, -1 +; GCN-IR-NEXT: s_cmov_b64 exec, vcc +; GCN-IR-NEXT: s_cbranch_scc0 .LBB11_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc @@ -1663,34 +1672,35 @@ define i64 @v_test_srem_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 ; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4 -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v8 +; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v9, vcc +; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6 ; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6 +; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 ; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10 ; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1 ; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0 -; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] ; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10 ; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: s_andn2_b64 s[4:5], exec, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 +; GCN-IR-NEXT: s_and_b64 s[12:13], s[4:5], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: s_cbranch_execnz .LBB11_3 +; GCN-IR-NEXT: s_cselect_b64 exec, s[4:5], s[10:11] +; GCN-IR-NEXT: s_cbranch_scc1 .LBB11_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: .LBB11_5: ; %Flow4 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3 ; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2 -; GCN-IR-NEXT: .LBB11_6: ; %Flow5 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: .LBB11_6: ; %udiv-end ; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v5 ; GCN-IR-NEXT: v_mul_hi_u32 v3, v0, v4 ; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v4 @@ -1825,22 +1835,26 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB12_6 +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-IR-NEXT: s_mov_b64 s[8:9], exec +; GCN-IR-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: s_cmov_b64 exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_scc0 .LBB12_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v2 -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc -; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 +; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[4:5], v2 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 +; GCN-IR-NEXT: s_xor_b64 s[6:7], vcc, exec ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GCN-IR-NEXT: s_cbranch_execz .LBB12_5 +; GCN-IR-NEXT: s_and_b64 s[10:11], vcc, -1 +; GCN-IR-NEXT: s_cmov_b64 exec, vcc +; GCN-IR-NEXT: s_cbranch_scc0 .LBB12_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc @@ -1856,34 +1870,35 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 ; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4 -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v8 +; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v9, vcc +; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6 ; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6 +; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 ; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10 ; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1 ; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0 -; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] ; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10 ; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: s_andn2_b64 s[4:5], exec, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 +; GCN-IR-NEXT: s_and_b64 s[12:13], s[4:5], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: s_cbranch_execnz .LBB12_3 +; GCN-IR-NEXT: s_cselect_b64 exec, s[4:5], s[10:11] +; GCN-IR-NEXT: s_cbranch_scc1 .LBB12_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: .LBB12_5: ; %Flow4 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3 ; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2 -; GCN-IR-NEXT: .LBB12_6: ; %Flow5 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: .LBB12_6: ; %udiv-end ; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v5 ; GCN-IR-NEXT: v_mul_hi_u32 v3, v0, v4 ; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v4 @@ -1926,26 +1941,30 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, s[4:5] ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[2:3] -; GCN-IR-NEXT: v_mov_b32_e32 v13, v12 +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[2:3] ; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[2:3] -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GCN-IR-NEXT: s_xor_b64 s[10:11], s[4:5], -1 +; GCN-IR-NEXT: s_and_b64 s[6:7], s[10:11], s[6:7] +; GCN-IR-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN-IR-NEXT: s_mov_b64 s[8:9], exec +; GCN-IR-NEXT: v_mov_b32_e32 v13, v12 ; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v1, 0, s[4:5] +; GCN-IR-NEXT: s_and_b64 s[10:11], s[6:7], -1 ; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v0, 0, s[4:5] -; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB13_6 +; GCN-IR-NEXT: s_cmov_b64 exec, s[6:7] +; GCN-IR-NEXT: s_cbranch_scc0 .LBB13_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v2 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[0:1], v2 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 +; GCN-IR-NEXT: s_xor_b64 s[6:7], vcc, exec ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB13_5 +; GCN-IR-NEXT: s_and_b64 s[4:5], vcc, -1 +; GCN-IR-NEXT: s_cmov_b64 exec, vcc +; GCN-IR-NEXT: s_cbranch_scc0 .LBB13_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[0:1], v6 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 0xffffffcf, v10 @@ -1970,23 +1989,24 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10 ; GCN-IR-NEXT: v_and_b32_e32 v10, 0x8000, v10 ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] -; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 ; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10 -; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 ; GCN-IR-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v9, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 +; GCN-IR-NEXT: s_andn2_b64 s[4:5], exec, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 +; GCN-IR-NEXT: s_and_b64 s[14:15], s[4:5], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: s_cbranch_execnz .LBB13_3 +; GCN-IR-NEXT: s_cselect_b64 exec, s[4:5], s[10:11] +; GCN-IR-NEXT: s_cbranch_scc1 .LBB13_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: .LBB13_5: ; %Flow4 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3 ; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2 -; GCN-IR-NEXT: .LBB13_6: ; %Flow5 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: .LBB13_6: ; %udiv-end ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[4:5], 15 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc diff --git a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll index 9ad9fa0304865..694e451c688ea 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll @@ -31,8 +31,9 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr ; MUBUF-NEXT: s_mov_b64 s[2:3], s[38:39] ; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5] ; MUBUF-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; MUBUF-NEXT: s_and_saveexec_b32 s0, vcc_lo -; MUBUF-NEXT: s_cbranch_execz .LBB0_2 +; MUBUF-NEXT: s_and_b32 s0, vcc_lo, -1 +; MUBUF-NEXT: s_cmov_b32 exec_lo, vcc_lo +; MUBUF-NEXT: s_cbranch_scc0 .LBB0_2 ; MUBUF-NEXT: ; %bb.1: ; %if.then4.i ; MUBUF-NEXT: v_add_nc_u32_e64 v0, 4, 0x4000 ; MUBUF-NEXT: s_mov_b32 s0, 0x41c64e6d @@ -65,8 +66,9 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr ; FLATSCR-NEXT: v_mov_b32_e32 v0, s2 ; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] ; FLATSCR-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; FLATSCR-NEXT: s_and_saveexec_b32 s0, vcc_lo -; FLATSCR-NEXT: s_cbranch_execz .LBB0_2 +; FLATSCR-NEXT: s_and_b32 s0, vcc_lo, -1 +; FLATSCR-NEXT: s_cmov_b32 exec_lo, vcc_lo +; FLATSCR-NEXT: s_cbranch_scc0 .LBB0_2 ; FLATSCR-NEXT: ; %bb.1: ; %if.then4.i ; FLATSCR-NEXT: s_movk_i32 s0, 0x4000 ; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:4 @@ -92,9 +94,10 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr ; MUBUF11-NEXT: s_waitcnt lgkmcnt(0) ; MUBUF11-NEXT: v_mov_b32_e32 v0, s2 ; MUBUF11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; MUBUF11-NEXT: s_mov_b32 s0, exec_lo -; MUBUF11-NEXT: v_cmpx_ne_u32_e32 0, v0 -; MUBUF11-NEXT: s_cbranch_execz .LBB0_2 +; MUBUF11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; MUBUF11-NEXT: s_and_b32 s0, vcc_lo, -1 +; MUBUF11-NEXT: s_cmov_b32 exec_lo, vcc_lo +; MUBUF11-NEXT: s_cbranch_scc0 .LBB0_2 ; MUBUF11-NEXT: ; %bb.1: ; %if.then4.i ; MUBUF11-NEXT: s_movk_i32 s0, 0x4000 ; MUBUF11-NEXT: scratch_load_b64 v[0:1], off, s0 offset:4 @@ -119,9 +122,10 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr ; FLATSCR11-NEXT: s_waitcnt lgkmcnt(0) ; FLATSCR11-NEXT: v_mov_b32_e32 v0, s2 ; FLATSCR11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; FLATSCR11-NEXT: s_mov_b32 s0, exec_lo -; FLATSCR11-NEXT: v_cmpx_ne_u32_e32 0, v0 -; FLATSCR11-NEXT: s_cbranch_execz .LBB0_2 +; FLATSCR11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; FLATSCR11-NEXT: s_and_b32 s0, vcc_lo, -1 +; FLATSCR11-NEXT: s_cmov_b32 exec_lo, vcc_lo +; FLATSCR11-NEXT: s_cbranch_scc0 .LBB0_2 ; FLATSCR11-NEXT: ; %bb.1: ; %if.then4.i ; FLATSCR11-NEXT: s_movk_i32 s0, 0x4000 ; FLATSCR11-NEXT: scratch_load_b64 v[0:1], off, s0 offset:4 diff --git a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll index c6a599094fe43..2356df96748af 100644 --- a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll +++ b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll @@ -205,15 +205,17 @@ define void @func_stacksave_nonentry_block(i1 %cond) { ; WAVE32-OPT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; WAVE32-OPT-NEXT: v_and_b32_e32 v0, 1, v0 ; WAVE32-OPT-NEXT: s_mov_b32 s4, exec_lo -; WAVE32-OPT-NEXT: v_cmpx_eq_u32_e32 1, v0 -; WAVE32-OPT-NEXT: s_cbranch_execz .LBB4_2 +; WAVE32-OPT-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; WAVE32-OPT-NEXT: s_and_b32 s5, vcc_lo, -1 +; WAVE32-OPT-NEXT: s_cmov_b32 exec_lo, vcc_lo +; WAVE32-OPT-NEXT: s_cbranch_scc0 .LBB4_2 ; WAVE32-OPT-NEXT: ; %bb.1: ; %bb1 ; WAVE32-OPT-NEXT: s_lshr_b32 s5, s32, 5 ; WAVE32-OPT-NEXT: ;;#ASMSTART ; WAVE32-OPT-NEXT: ; use s5 ; WAVE32-OPT-NEXT: ;;#ASMEND -; WAVE32-OPT-NEXT: .LBB4_2: ; %bb2 ; WAVE32-OPT-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; WAVE32-OPT-NEXT: .LBB4_2: ; %bb2 ; WAVE32-OPT-NEXT: s_setpc_b64 s[30:31] ; ; WAVE64-OPT-LABEL: func_stacksave_nonentry_block: @@ -221,15 +223,17 @@ define void @func_stacksave_nonentry_block(i1 %cond) { ; WAVE64-OPT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; WAVE64-OPT-NEXT: v_and_b32_e32 v0, 1, v0 ; WAVE64-OPT-NEXT: s_mov_b64 s[4:5], exec -; WAVE64-OPT-NEXT: v_cmpx_eq_u32_e32 1, v0 -; WAVE64-OPT-NEXT: s_cbranch_execz .LBB4_2 +; WAVE64-OPT-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; WAVE64-OPT-NEXT: s_and_b64 s[6:7], vcc, -1 +; WAVE64-OPT-NEXT: s_cmov_b64 exec, vcc +; WAVE64-OPT-NEXT: s_cbranch_scc0 .LBB4_2 ; WAVE64-OPT-NEXT: ; %bb.1: ; %bb1 ; WAVE64-OPT-NEXT: s_lshr_b32 s6, s32, 6 ; WAVE64-OPT-NEXT: ;;#ASMSTART ; WAVE64-OPT-NEXT: ; use s6 ; WAVE64-OPT-NEXT: ;;#ASMEND -; WAVE64-OPT-NEXT: .LBB4_2: ; %bb2 ; WAVE64-OPT-NEXT: s_or_b64 exec, exec, s[4:5] +; WAVE64-OPT-NEXT: .LBB4_2: ; %bb2 ; WAVE64-OPT-NEXT: s_setpc_b64 s[30:31] ; ; WAVE32-O0-LABEL: func_stacksave_nonentry_block: @@ -244,29 +248,33 @@ define void @func_stacksave_nonentry_block(i1 %cond) { ; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s7 ; WAVE32-O0-NEXT: v_and_b32_e64 v1, 1, v1 -; WAVE32-O0-NEXT: v_cmp_eq_u32_e64 s5, v1, 1 -; WAVE32-O0-NEXT: s_mov_b32 s4, exec_lo +; WAVE32-O0-NEXT: v_cmp_eq_u32_e64 s4, v1, 1 +; WAVE32-O0-NEXT: s_mov_b32 s5, exec_lo ; WAVE32-O0-NEXT: s_waitcnt vmcnt(0) -; WAVE32-O0-NEXT: v_writelane_b32 v0, s4, 0 +; WAVE32-O0-NEXT: v_writelane_b32 v0, s5, 0 ; WAVE32-O0-NEXT: s_or_saveexec_b32 s7, -1 ; WAVE32-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s7 -; WAVE32-O0-NEXT: s_and_b32 s4, s4, s5 -; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s4 -; WAVE32-O0-NEXT: s_cbranch_execz .LBB4_2 -; WAVE32-O0-NEXT: ; %bb.1: ; %bb1 -; WAVE32-O0-NEXT: s_mov_b32 s4, s32 -; WAVE32-O0-NEXT: s_lshr_b32 s4, s4, 5 -; WAVE32-O0-NEXT: ;;#ASMSTART -; WAVE32-O0-NEXT: ; use s4 -; WAVE32-O0-NEXT: ;;#ASMEND -; WAVE32-O0-NEXT: .LBB4_2: ; %bb2 +; WAVE32-O0-NEXT: s_and_b32 s5, s4, -1 +; WAVE32-O0-NEXT: s_cmov_b32 exec_lo, s4 +; WAVE32-O0-NEXT: s_cbranch_scc1 .LBB4_1 +; WAVE32-O0-NEXT: s_branch .LBB4_2 +; WAVE32-O0-NEXT: .LBB4_1: ; %bb1 ; WAVE32-O0-NEXT: s_or_saveexec_b32 s7, -1 ; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s7 ; WAVE32-O0-NEXT: s_waitcnt vmcnt(0) ; WAVE32-O0-NEXT: v_readlane_b32 s4, v0, 0 +; WAVE32-O0-NEXT: s_mov_b32 s5, s32 +; WAVE32-O0-NEXT: s_lshr_b32 s5, s5, 5 +; WAVE32-O0-NEXT: ;;#ASMSTART +; WAVE32-O0-NEXT: ; use s5 +; WAVE32-O0-NEXT: ;;#ASMEND ; WAVE32-O0-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; WAVE32-O0-NEXT: .LBB4_2: ; %bb2 +; WAVE32-O0-NEXT: s_or_saveexec_b32 s7, -1 +; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s7 ; WAVE32-O0-NEXT: ; kill: killed $vgpr0 ; WAVE32-O0-NEXT: s_xor_saveexec_b32 s4, -1 ; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -286,31 +294,35 @@ define void @func_stacksave_nonentry_block(i1 %cond) { ; WAVE64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; WAVE64-O0-NEXT: s_mov_b64 exec, s[10:11] ; WAVE64-O0-NEXT: v_and_b32_e64 v1, 1, v1 -; WAVE64-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v1, 1 -; WAVE64-O0-NEXT: s_mov_b64 s[4:5], exec +; WAVE64-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, 1 +; WAVE64-O0-NEXT: s_mov_b64 s[6:7], exec ; WAVE64-O0-NEXT: s_waitcnt vmcnt(0) -; WAVE64-O0-NEXT: v_writelane_b32 v0, s4, 0 -; WAVE64-O0-NEXT: v_writelane_b32 v0, s5, 1 +; WAVE64-O0-NEXT: v_writelane_b32 v0, s6, 0 +; WAVE64-O0-NEXT: v_writelane_b32 v0, s7, 1 ; WAVE64-O0-NEXT: s_or_saveexec_b64 s[10:11], -1 ; WAVE64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; WAVE64-O0-NEXT: s_mov_b64 exec, s[10:11] -; WAVE64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; WAVE64-O0-NEXT: s_mov_b64 exec, s[4:5] -; WAVE64-O0-NEXT: s_cbranch_execz .LBB4_2 -; WAVE64-O0-NEXT: ; %bb.1: ; %bb1 -; WAVE64-O0-NEXT: s_mov_b32 s4, s32 -; WAVE64-O0-NEXT: s_lshr_b32 s4, s4, 6 -; WAVE64-O0-NEXT: ;;#ASMSTART -; WAVE64-O0-NEXT: ; use s4 -; WAVE64-O0-NEXT: ;;#ASMEND -; WAVE64-O0-NEXT: .LBB4_2: ; %bb2 +; WAVE64-O0-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; WAVE64-O0-NEXT: s_cmov_b64 exec, s[4:5] +; WAVE64-O0-NEXT: s_cbranch_scc1 .LBB4_1 +; WAVE64-O0-NEXT: s_branch .LBB4_2 +; WAVE64-O0-NEXT: .LBB4_1: ; %bb1 ; WAVE64-O0-NEXT: s_or_saveexec_b64 s[10:11], -1 ; WAVE64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; WAVE64-O0-NEXT: s_mov_b64 exec, s[10:11] ; WAVE64-O0-NEXT: s_waitcnt vmcnt(0) ; WAVE64-O0-NEXT: v_readlane_b32 s4, v0, 0 ; WAVE64-O0-NEXT: v_readlane_b32 s5, v0, 1 +; WAVE64-O0-NEXT: s_mov_b32 s6, s32 +; WAVE64-O0-NEXT: s_lshr_b32 s6, s6, 6 +; WAVE64-O0-NEXT: ;;#ASMSTART +; WAVE64-O0-NEXT: ; use s6 +; WAVE64-O0-NEXT: ;;#ASMEND ; WAVE64-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; WAVE64-O0-NEXT: .LBB4_2: ; %bb2 +; WAVE64-O0-NEXT: s_or_saveexec_b64 s[10:11], -1 +; WAVE64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; WAVE64-O0-NEXT: s_mov_b64 exec, s[10:11] ; WAVE64-O0-NEXT: ; kill: killed $vgpr0 ; WAVE64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; WAVE64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -326,21 +338,22 @@ define void @func_stacksave_nonentry_block(i1 %cond) { ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 exec_lo, s4 ; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane ; WAVE32-WWM-PREALLOC-NEXT: v_and_b32_e64 v0, 1, v0 -; WAVE32-WWM-PREALLOC-NEXT: v_cmp_eq_u32_e64 s5, v0, 1 -; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s4, exec_lo -; WAVE32-WWM-PREALLOC-NEXT: v_writelane_b32 v1, s4, 0 -; WAVE32-WWM-PREALLOC-NEXT: s_and_b32 s4, s4, s5 -; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 exec_lo, s4 -; WAVE32-WWM-PREALLOC-NEXT: s_cbranch_execz .LBB4_2 -; WAVE32-WWM-PREALLOC-NEXT: ; %bb.1: ; %bb1 -; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s4, s32 -; WAVE32-WWM-PREALLOC-NEXT: s_lshr_b32 s4, s4, 5 +; WAVE32-WWM-PREALLOC-NEXT: v_cmp_eq_u32_e64 s4, v0, 1 +; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s5, exec_lo +; WAVE32-WWM-PREALLOC-NEXT: v_writelane_b32 v1, s5, 0 +; WAVE32-WWM-PREALLOC-NEXT: s_and_b32 s5, s4, -1 +; WAVE32-WWM-PREALLOC-NEXT: s_cmov_b32 exec_lo, s4 +; WAVE32-WWM-PREALLOC-NEXT: s_cbranch_scc1 .LBB4_1 +; WAVE32-WWM-PREALLOC-NEXT: s_branch .LBB4_2 +; WAVE32-WWM-PREALLOC-NEXT: .LBB4_1: ; %bb1 +; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s4, v1, 0 +; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s5, s32 +; WAVE32-WWM-PREALLOC-NEXT: s_lshr_b32 s5, s5, 5 ; WAVE32-WWM-PREALLOC-NEXT: ;;#ASMSTART -; WAVE32-WWM-PREALLOC-NEXT: ; use s4 +; WAVE32-WWM-PREALLOC-NEXT: ; use s5 ; WAVE32-WWM-PREALLOC-NEXT: ;;#ASMEND -; WAVE32-WWM-PREALLOC-NEXT: .LBB4_2: ; %bb2 -; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s4, v1, 0 ; WAVE32-WWM-PREALLOC-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; WAVE32-WWM-PREALLOC-NEXT: .LBB4_2: ; %bb2 ; WAVE32-WWM-PREALLOC-NEXT: ; kill: killed $vgpr1 ; WAVE32-WWM-PREALLOC-NEXT: s_xor_saveexec_b32 s4, -1 ; WAVE32-WWM-PREALLOC-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/stale-livevar-in-twoaddr-pass.mir b/llvm/test/CodeGen/AMDGPU/stale-livevar-in-twoaddr-pass.mir index 08bdec8871e17..c554f912c2bea 100644 --- a/llvm/test/CodeGen/AMDGPU/stale-livevar-in-twoaddr-pass.mir +++ b/llvm/test/CodeGen/AMDGPU/stale-livevar-in-twoaddr-pass.mir @@ -17,10 +17,10 @@ body: | bb.1: %2:vgpr_32 = V_MAC_F32_e32 0, %0, %1, implicit $mode, implicit $exec %3:vgpr_32 = V_MED3_F32_e64 0, %1, 0, %2, 0, %2, 0, 0, implicit $mode, implicit $exec + SI_WAVE_RECONVERGE %6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.2: %4:vgpr_32 = PHI %5, %bb.3, %3, %bb.1 - SI_END_CF %6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec EXP_DONE 0, %4, %4, %4, %4, -1, 0, 15, implicit $exec S_ENDPGM 0 diff --git a/llvm/test/CodeGen/AMDGPU/stop-tail-duplicate-cfg-intrinsic.mir b/llvm/test/CodeGen/AMDGPU/stop-tail-duplicate-cfg-intrinsic.mir index c23c8900096fb..19e012413630a 100644 --- a/llvm/test/CodeGen/AMDGPU/stop-tail-duplicate-cfg-intrinsic.mir +++ b/llvm/test/CodeGen/AMDGPU/stop-tail-duplicate-cfg-intrinsic.mir @@ -26,7 +26,6 @@ body: | ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[PHI]], [[COPY2]], 0, implicit $exec @@ -40,6 +39,7 @@ body: | ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_MOV_B32_3]], %subreg.sub1 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[DEF]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET [[PHI1]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.1 bb.1: liveins: $vgpr0 @@ -56,7 +56,6 @@ body: | S_BRANCH %bb.4 bb.3: - SI_END_CF %8:sreg_64_xexec, implicit-def $exec, implicit-def $scc, implicit $exec %13:sreg_32 = S_MOV_B32 1 %15:vgpr_32 = COPY %13:sreg_32 %10:vgpr_32, dead %20:sreg_64_xexec = V_ADD_CO_U32_e64 %6:vgpr_32, %15:vgpr_32, 0, implicit $exec @@ -68,6 +67,7 @@ body: | %18:sreg_64 = REG_SEQUENCE %16:sreg_32, %subreg.sub0, %17:sreg_32, %subreg.sub1 %19:sgpr_128 = REG_SEQUENCE %12:sreg_64, %subreg.sub0_sub1, %18:sreg_64, %subreg.sub2_sub3 BUFFER_STORE_DWORD_OFFSET %11:vgpr_32, %19:sgpr_128, 0, 0, 0, 0, implicit $exec + SI_WAVE_RECONVERGE %8:sreg_64_xexec, implicit-def $exec, implicit-def $scc, implicit $exec S_BRANCH %bb.2 ... diff --git a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll index 873567c3ab6f4..7ae0341482cdf 100644 --- a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll +++ b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll @@ -12,22 +12,25 @@ define amdgpu_kernel void @foobar(float %a0, float %a1, ptr addrspace(1) %out) n ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; CHECK-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0 +; CHECK-NEXT: s_mov_b64 s[6:7], exec ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: s_and_b64 s[8:9], vcc, -1 ; CHECK-NEXT: v_mov_b32_e32 v1, s5 ; CHECK-NEXT: v_mov_b32_e32 v2, s6 ; CHECK-NEXT: v_mov_b32_e32 v3, s7 -; CHECK-NEXT: s_and_saveexec_b64 s[6:7], vcc +; CHECK-NEXT: s_cmov_b64 exec, vcc +; CHECK-NEXT: s_cbranch_scc0 .LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %ift ; CHECK-NEXT: s_mov_b32 s4, s5 ; CHECK-NEXT: v_mov_b32_e32 v0, s4 ; CHECK-NEXT: v_mov_b32_e32 v1, s5 ; CHECK-NEXT: v_mov_b32_e32 v2, s6 ; CHECK-NEXT: v_mov_b32_e32 v3, s7 -; CHECK-NEXT: ; %bb.2: ; %ife ; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] +; CHECK-NEXT: .LBB0_2: ; %ife ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; CHECK-NEXT: s_mov_b32 s3, 0xf000 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll index ecebbb9ac874f..2fc9f8b8f860b 100644 --- a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll +++ b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll @@ -100,67 +100,67 @@ else: ; preds = %else.if.cond define amdgpu_ps { <4 x float> } @test_return_to_epilog_with_optimized_kill(float %val) #0 { ; GCN-LABEL: name: test_return_to_epilog_with_optimized_kill ; GCN: bb.0 (%ir-block.0): - ; GCN-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000) + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.4(0x40000000) ; GCN-NEXT: liveins: $vgpr0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_RCP_F32_e32 $vgpr0, implicit $mode, implicit $exec ; GCN-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; GCN-NEXT: nofpexcept V_CMP_NGT_F32_e32 0, killed $vgpr1, implicit-def $vcc, implicit $mode, implicit $exec - ; GCN-NEXT: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec - ; GCN-NEXT: renamable $sgpr2_sgpr3 = S_XOR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def dead $scc - ; GCN-NEXT: S_CBRANCH_EXECNZ %bb.3, implicit $exec + ; GCN-NEXT: renamable $sgpr2_sgpr3 = S_XOR_B64 renamable $vcc, $exec, implicit-def $scc + ; GCN-NEXT: dead renamable $sgpr4_sgpr5 = S_AND_B64 renamable $vcc, -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64 killed renamable $vcc, implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC0 %bb.4, implicit killed $scc ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.1.Flow1: - ; GCN-NEXT: successors: %bb.6(0x40000000), %bb.2(0x40000000) - ; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $sgpr2_sgpr3 = S_ANDN2_SAVEEXEC_B64 killed $sgpr2_sgpr3, implicit-def $exec, implicit-def $scc, implicit $exec - ; GCN-NEXT: S_CBRANCH_EXECNZ %bb.6, implicit $exec - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.2.end: - ; GCN-NEXT: successors: %bb.9(0x80000000) - ; GCN-NEXT: liveins: $sgpr2_sgpr3 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def $scc - ; GCN-NEXT: S_BRANCH %bb.9 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.3.flow.preheader: - ; GCN-NEXT: successors: %bb.4(0x80000000) + ; GCN-NEXT: bb.1.flow.preheader: + ; GCN-NEXT: successors: %bb.2(0x80000000) ; GCN-NEXT: liveins: $vgpr0, $sgpr0_sgpr1, $sgpr2_sgpr3 ; GCN-NEXT: {{ $}} ; GCN-NEXT: nofpexcept V_CMP_NGT_F32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $mode, implicit $exec ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_MOV_B64 0 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.4.flow: - ; GCN-NEXT: successors: %bb.5(0x04000000), %bb.4(0x7c000000) + ; GCN-NEXT: bb.2.flow: + ; GCN-NEXT: successors: %bb.3(0x04000000), %bb.2(0x7c000000) ; GCN-NEXT: liveins: $vcc, $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5 ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $sgpr6_sgpr7 = S_AND_B64 $exec, renamable $vcc, implicit-def $scc ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_OR_B64 killed renamable $sgpr6_sgpr7, killed renamable $sgpr4_sgpr5, implicit-def $scc - ; GCN-NEXT: $exec = S_ANDN2_B64 $exec, renamable $sgpr4_sgpr5, implicit-def $scc - ; GCN-NEXT: S_CBRANCH_EXECNZ %bb.4, implicit $exec + ; GCN-NEXT: renamable $sgpr6_sgpr7 = S_ANDN2_B64 $exec, renamable $sgpr4_sgpr5, implicit-def $scc + ; GCN-NEXT: dead renamable $sgpr8_sgpr9 = S_AND_B64 renamable $sgpr6_sgpr7, -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CSELECT_B64 killed renamable $sgpr6_sgpr7, renamable $sgpr4_sgpr5, implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit killed $scc ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.5.Flow: - ; GCN-NEXT: successors: %bb.6(0x40000000), %bb.2(0x40000000) - ; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5 + ; GCN-NEXT: bb.3.Flow: + ; GCN-NEXT: successors: %bb.4(0x80000000) + ; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc - ; GCN-NEXT: $sgpr2_sgpr3 = S_ANDN2_SAVEEXEC_B64 killed $sgpr2_sgpr3, implicit-def $exec, implicit-def $scc, implicit $exec - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; GCN-NEXT: $exec = S_OR_B64 $exec, renamable $sgpr2_sgpr3, implicit-def $scc ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.6.kill0: - ; GCN-NEXT: successors: %bb.7(0x40000000), %bb.8(0x40000000) + ; GCN-NEXT: bb.4.Flow1: + ; GCN-NEXT: successors: %bb.5(0x40000000), %bb.7(0x40000000) ; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3 ; GCN-NEXT: {{ $}} + ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_XOR_B64 renamable $sgpr2_sgpr3, $exec, implicit-def $scc + ; GCN-NEXT: dead renamable $sgpr6_sgpr7 = S_AND_B64 renamable $sgpr2_sgpr3, -1, implicit-def $scc + ; GCN-NEXT: $exec = S_CMOV_B64 killed renamable $sgpr2_sgpr3, implicit $scc + ; GCN-NEXT: S_CBRANCH_SCC0 %bb.7, implicit killed $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.5.kill0: + ; GCN-NEXT: successors: %bb.6(0x40000000), %bb.8(0x40000000) + ; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr4_sgpr5 + ; GCN-NEXT: {{ $}} ; GCN-NEXT: dead renamable $sgpr0_sgpr1 = S_ANDN2_B64 killed renamable $sgpr0_sgpr1, $exec, implicit-def $scc ; GCN-NEXT: S_CBRANCH_SCC0 %bb.8, implicit $scc ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.7.kill0: - ; GCN-NEXT: successors: %bb.9(0x80000000) - ; GCN-NEXT: liveins: $sgpr2_sgpr3, $scc + ; GCN-NEXT: bb.6.kill0: + ; GCN-NEXT: successors: %bb.7(0x80000000) + ; GCN-NEXT: liveins: $sgpr4_sgpr5, $scc ; GCN-NEXT: {{ $}} ; GCN-NEXT: $exec = S_MOV_B64 0 - ; GCN-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def $scc + ; GCN-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.7.end: + ; GCN-NEXT: successors: %bb.9(0x80000000) + ; GCN-NEXT: {{ $}} ; GCN-NEXT: S_BRANCH %bb.9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.8: diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll index 837b46f0ce578..9c39bf78684b1 100644 --- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll +++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll @@ -93,22 +93,24 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[54:55], 1, v1 ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[56:57], 1, v3 ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[58:59], 1, v2 -; GLOBALNESS1-NEXT: s_branch .LBB1_4 +; GLOBALNESS1-NEXT: s_branch .LBB1_5 ; GLOBALNESS1-NEXT: .LBB1_1: ; %bb70.i -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[58:59] ; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_29 -; GLOBALNESS1-NEXT: .LBB1_2: ; %Flow15 -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS1-NEXT: .LBB1_2: ; %Flow14 +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[4:5] +; GLOBALNESS1-NEXT: .LBB1_3: ; %Flow15 +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], 0 ; GLOBALNESS1-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GLOBALNESS1-NEXT: .LBB1_3: ; %Flow28 -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS1-NEXT: .LBB1_4: ; %Flow28 +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7] ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[44:45], v[0:1], v[0:1] op_sel:[0,1] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_30 -; GLOBALNESS1-NEXT: .LBB1_4: ; %bb5 +; GLOBALNESS1-NEXT: .LBB1_5: ; %bb5 ; GLOBALNESS1-NEXT: ; =>This Loop Header: Depth=1 ; GLOBALNESS1-NEXT: ; Child Loop BB1_16 Depth 2 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0x80 @@ -133,52 +135,54 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[44:45] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], -1 ; GLOBALNESS1-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_9 -; GLOBALNESS1-NEXT: ; %bb.5: ; %NodeBlock -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_10 +; GLOBALNESS1-NEXT: ; %bb.6: ; %NodeBlock +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS1-NEXT: s_cmp_lt_i32 s75, 1 -; GLOBALNESS1-NEXT: s_cbranch_scc1 .LBB1_7 -; GLOBALNESS1-NEXT: ; %bb.6: ; %LeafBlock12 -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS1-NEXT: s_cbranch_scc1 .LBB1_8 +; GLOBALNESS1-NEXT: ; %bb.7: ; %LeafBlock12 +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS1-NEXT: s_cmp_lg_u32 s75, 1 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], -1 ; GLOBALNESS1-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_8 -; GLOBALNESS1-NEXT: s_branch .LBB1_9 -; GLOBALNESS1-NEXT: .LBB1_7: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_9 +; GLOBALNESS1-NEXT: s_branch .LBB1_10 +; GLOBALNESS1-NEXT: .LBB1_8: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], 0 ; GLOBALNESS1-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GLOBALNESS1-NEXT: .LBB1_8: ; %LeafBlock -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS1-NEXT: .LBB1_9: ; %LeafBlock +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS1-NEXT: s_cmp_lg_u32 s75, 0 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], 0 ; GLOBALNESS1-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GLOBALNESS1-NEXT: .LBB1_9: ; %Flow25 -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS1-NEXT: .LBB1_10: ; %Flow25 +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7] ; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_24 -; GLOBALNESS1-NEXT: ; %bb.10: ; %baz.exit.i -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS1-NEXT: ; %bb.11: ; %baz.exit.i +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0 ; GLOBALNESS1-NEXT: flat_load_dword v0, v[2:3] +; GLOBALNESS1-NEXT: s_mov_b64 s[72:73], exec ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[60:61], 0, v0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0 +; GLOBALNESS1-NEXT: s_and_b64 s[4:5], s[60:61], -1 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v1, 0x3ff00000 -; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[72:73], s[60:61] -; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_26 -; GLOBALNESS1-NEXT: ; %bb.11: ; %bb33.i -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS1-NEXT: s_cmov_b64 exec, s[60:61] +; GLOBALNESS1-NEXT: s_cbranch_scc0 .LBB1_26 +; GLOBALNESS1-NEXT: ; %bb.12: ; %bb33.i +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS1-NEXT: global_load_dwordx2 v[0:1], v[2:3], off ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[52:53] -; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_13 -; GLOBALNESS1-NEXT: ; %bb.12: ; %bb39.i -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_14 +; GLOBALNESS1-NEXT: ; %bb.13: ; %bb39.i +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0 ; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[42:43], off -; GLOBALNESS1-NEXT: .LBB1_13: ; %bb44.lr.ph.i -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS1-NEXT: .LBB1_14: ; %bb44.lr.ph.i +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v46 ; GLOBALNESS1-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) @@ -187,15 +191,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[62:63], 0, v2 ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[64:65], 1, v0 ; GLOBALNESS1-NEXT: s_branch .LBB1_16 -; GLOBALNESS1-NEXT: .LBB1_14: ; %Flow16 -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[4:5] ; GLOBALNESS1-NEXT: .LBB1_15: ; %bb63.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[50:51] ; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_25 ; GLOBALNESS1-NEXT: .LBB1_16: ; %bb44.i -; GLOBALNESS1-NEXT: ; Parent Loop BB1_4 Depth=1 +; GLOBALNESS1-NEXT: ; Parent Loop BB1_5 Depth=1 ; GLOBALNESS1-NEXT: ; => This Inner Loop Header: Depth=2 ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[46:47] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_15 @@ -245,37 +246,44 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: global_store_dwordx2 v[46:47], v[44:45], off ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[76:77] -; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[62:63] -; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_14 +; GLOBALNESS1-NEXT: s_and_b64 s[6:7], s[62:63], exec +; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], exec +; GLOBALNESS1-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GLOBALNESS1-NEXT: s_cmov_b64 exec, s[6:7] +; GLOBALNESS1-NEXT: s_cbranch_scc0 .LBB1_15 ; GLOBALNESS1-NEXT: ; %bb.23: ; %bb62.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42 ; GLOBALNESS1-NEXT: global_store_dwordx2 v[46:47], v[42:43], off -; GLOBALNESS1-NEXT: s_branch .LBB1_14 -; GLOBALNESS1-NEXT: .LBB1_24: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[4:5] +; GLOBALNESS1-NEXT: s_branch .LBB1_15 +; GLOBALNESS1-NEXT: .LBB1_24: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], -1 ; GLOBALNESS1-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GLOBALNESS1-NEXT: s_branch .LBB1_3 +; GLOBALNESS1-NEXT: s_branch .LBB1_4 ; GLOBALNESS1-NEXT: .LBB1_25: ; %Flow23 -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0 -; GLOBALNESS1-NEXT: .LBB1_26: ; %Flow24 -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[72:73] -; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[60:61] -; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_2 +; GLOBALNESS1-NEXT: .LBB1_26: ; %bb64.i +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_5 Depth=1 +; GLOBALNESS1-NEXT: s_and_b64 s[6:7], s[60:61], exec +; GLOBALNESS1-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; GLOBALNESS1-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GLOBALNESS1-NEXT: s_cmov_b64 exec, s[6:7] +; GLOBALNESS1-NEXT: s_cbranch_scc0 .LBB1_3 ; GLOBALNESS1-NEXT: ; %bb.27: ; %bb67.i -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[56:57] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_1 ; GLOBALNESS1-NEXT: ; %bb.28: ; %bb69.i -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0 ; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[42:43], off ; GLOBALNESS1-NEXT: s_branch .LBB1_1 ; GLOBALNESS1-NEXT: .LBB1_29: ; %bb73.i -; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0 ; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[42:43], off @@ -380,22 +388,24 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[54:55], 1, v1 ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[56:57], 1, v3 ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[58:59], 1, v2 -; GLOBALNESS0-NEXT: s_branch .LBB1_4 +; GLOBALNESS0-NEXT: s_branch .LBB1_5 ; GLOBALNESS0-NEXT: .LBB1_1: ; %bb70.i -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[58:59] ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_29 -; GLOBALNESS0-NEXT: .LBB1_2: ; %Flow15 -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS0-NEXT: .LBB1_2: ; %Flow14 +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[4:5] +; GLOBALNESS0-NEXT: .LBB1_3: ; %Flow15 +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], 0 ; GLOBALNESS0-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GLOBALNESS0-NEXT: .LBB1_3: ; %Flow28 -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS0-NEXT: .LBB1_4: ; %Flow28 +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7] ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[44:45], v[0:1], v[0:1] op_sel:[0,1] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_30 -; GLOBALNESS0-NEXT: .LBB1_4: ; %bb5 +; GLOBALNESS0-NEXT: .LBB1_5: ; %bb5 ; GLOBALNESS0-NEXT: ; =>This Loop Header: Depth=1 ; GLOBALNESS0-NEXT: ; Child Loop BB1_16 Depth 2 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0x80 @@ -420,52 +430,54 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[44:45] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], -1 ; GLOBALNESS0-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_9 -; GLOBALNESS0-NEXT: ; %bb.5: ; %NodeBlock -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_10 +; GLOBALNESS0-NEXT: ; %bb.6: ; %NodeBlock +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS0-NEXT: s_cmp_lt_i32 s75, 1 -; GLOBALNESS0-NEXT: s_cbranch_scc1 .LBB1_7 -; GLOBALNESS0-NEXT: ; %bb.6: ; %LeafBlock12 -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS0-NEXT: s_cbranch_scc1 .LBB1_8 +; GLOBALNESS0-NEXT: ; %bb.7: ; %LeafBlock12 +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS0-NEXT: s_cmp_lg_u32 s75, 1 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], -1 ; GLOBALNESS0-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_8 -; GLOBALNESS0-NEXT: s_branch .LBB1_9 -; GLOBALNESS0-NEXT: .LBB1_7: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_9 +; GLOBALNESS0-NEXT: s_branch .LBB1_10 +; GLOBALNESS0-NEXT: .LBB1_8: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], 0 ; GLOBALNESS0-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GLOBALNESS0-NEXT: .LBB1_8: ; %LeafBlock -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS0-NEXT: .LBB1_9: ; %LeafBlock +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS0-NEXT: s_cmp_lg_u32 s75, 0 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], 0 ; GLOBALNESS0-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GLOBALNESS0-NEXT: .LBB1_9: ; %Flow25 -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS0-NEXT: .LBB1_10: ; %Flow25 +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7] ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_24 -; GLOBALNESS0-NEXT: ; %bb.10: ; %baz.exit.i -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS0-NEXT: ; %bb.11: ; %baz.exit.i +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0 ; GLOBALNESS0-NEXT: flat_load_dword v0, v[2:3] +; GLOBALNESS0-NEXT: s_mov_b64 s[72:73], exec ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[60:61], 0, v0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0 +; GLOBALNESS0-NEXT: s_and_b64 s[4:5], s[60:61], -1 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v1, 0x3ff00000 -; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[72:73], s[60:61] -; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_26 -; GLOBALNESS0-NEXT: ; %bb.11: ; %bb33.i -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS0-NEXT: s_cmov_b64 exec, s[60:61] +; GLOBALNESS0-NEXT: s_cbranch_scc0 .LBB1_26 +; GLOBALNESS0-NEXT: ; %bb.12: ; %bb33.i +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[2:3], off ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[52:53] -; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_13 -; GLOBALNESS0-NEXT: ; %bb.12: ; %bb39.i -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_14 +; GLOBALNESS0-NEXT: ; %bb.13: ; %bb39.i +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0 ; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[42:43], off -; GLOBALNESS0-NEXT: .LBB1_13: ; %bb44.lr.ph.i -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS0-NEXT: .LBB1_14: ; %bb44.lr.ph.i +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v46 ; GLOBALNESS0-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) @@ -474,15 +486,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[62:63], 0, v2 ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[64:65], 1, v0 ; GLOBALNESS0-NEXT: s_branch .LBB1_16 -; GLOBALNESS0-NEXT: .LBB1_14: ; %Flow16 -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[4:5] ; GLOBALNESS0-NEXT: .LBB1_15: ; %bb63.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[50:51] ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_25 ; GLOBALNESS0-NEXT: .LBB1_16: ; %bb44.i -; GLOBALNESS0-NEXT: ; Parent Loop BB1_4 Depth=1 +; GLOBALNESS0-NEXT: ; Parent Loop BB1_5 Depth=1 ; GLOBALNESS0-NEXT: ; => This Inner Loop Header: Depth=2 ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[46:47] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_15 @@ -532,37 +541,44 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: global_store_dwordx2 v[46:47], v[44:45], off ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[76:77] -; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[62:63] -; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_14 +; GLOBALNESS0-NEXT: s_and_b64 s[6:7], s[62:63], exec +; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], exec +; GLOBALNESS0-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GLOBALNESS0-NEXT: s_cmov_b64 exec, s[6:7] +; GLOBALNESS0-NEXT: s_cbranch_scc0 .LBB1_15 ; GLOBALNESS0-NEXT: ; %bb.23: ; %bb62.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42 ; GLOBALNESS0-NEXT: global_store_dwordx2 v[46:47], v[42:43], off -; GLOBALNESS0-NEXT: s_branch .LBB1_14 -; GLOBALNESS0-NEXT: .LBB1_24: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[4:5] +; GLOBALNESS0-NEXT: s_branch .LBB1_15 +; GLOBALNESS0-NEXT: .LBB1_24: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], -1 ; GLOBALNESS0-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GLOBALNESS0-NEXT: s_branch .LBB1_3 +; GLOBALNESS0-NEXT: s_branch .LBB1_4 ; GLOBALNESS0-NEXT: .LBB1_25: ; %Flow23 -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0 -; GLOBALNESS0-NEXT: .LBB1_26: ; %Flow24 -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[72:73] -; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[60:61] -; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_2 +; GLOBALNESS0-NEXT: .LBB1_26: ; %bb64.i +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_5 Depth=1 +; GLOBALNESS0-NEXT: s_and_b64 s[6:7], s[60:61], exec +; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; GLOBALNESS0-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GLOBALNESS0-NEXT: s_cmov_b64 exec, s[6:7] +; GLOBALNESS0-NEXT: s_cbranch_scc0 .LBB1_3 ; GLOBALNESS0-NEXT: ; %bb.27: ; %bb67.i -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[56:57] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_1 ; GLOBALNESS0-NEXT: ; %bb.28: ; %bb69.i -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0 ; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[42:43], off ; GLOBALNESS0-NEXT: s_branch .LBB1_1 ; GLOBALNESS0-NEXT: .LBB1_29: ; %bb73.i -; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_5 Depth=1 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0 ; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[42:43], off diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll index 48b9c72ea6892..86431338ee032 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -310,39 +310,43 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) { ; GCN-IR-LABEL: v_test_udiv_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v2 -; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4 +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[4:5] +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v3 ; GCN-IR-NEXT: v_min_u32_e32 v10, v4, v5 ; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v0 -; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1 ; GCN-IR-NEXT: v_min_u32_e32 v11, v4, v5 -; GCN-IR-NEXT: v_sub_i32_e64 v6, s[6:7], v10, v11 -; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; GCN-IR-NEXT: v_subb_u32_e64 v7, s[6:7], 0, 0, s[6:7] -; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[6:7], 63, v[6:7] -; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[6:7] -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v1, 0, s[4:5] -; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v0, 0, s[4:5] -; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB1_6 +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v10, v11 +; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[6:7] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[4:5], 63, v[6:7] +; GCN-IR-NEXT: s_or_b64 s[8:9], s[8:9], vcc +; GCN-IR-NEXT: s_xor_b64 s[10:11], s[8:9], -1 +; GCN-IR-NEXT: s_and_b64 s[4:5], s[10:11], s[4:5] +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-IR-NEXT: s_mov_b64 s[6:7], exec +; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v1, 0, s[8:9] +; GCN-IR-NEXT: s_and_b64 s[10:11], s[4:5], -1 +; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v0, 0, s[8:9] +; GCN-IR-NEXT: s_cmov_b64 exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_scc0 .LBB1_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v6 ; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v7, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v6 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v6 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4 ; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: s_xor_b64 s[8:9], vcc, exec ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB1_5 +; GCN-IR-NEXT: s_and_b64 s[4:5], vcc, -1 +; GCN-IR-NEXT: s_cmov_b64 exec, vcc +; GCN-IR-NEXT: s_cbranch_scc0 .LBB1_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v2 ; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[0:1], v8 @@ -360,34 +364,35 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5 ; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v6 -; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 ; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v12, v8 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 ; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v13, v9, vcc +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v0 ; GCN-IR-NEXT: v_or_b32_e32 v4, v10, v4 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v6 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-IR-NEXT: v_or_b32_e32 v5, v11, v5 ; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v10 ; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v3 ; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v2 -; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10 ; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v11, v7 ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: s_andn2_b64 s[4:5], exec, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v11, v7 +; GCN-IR-NEXT: s_and_b64 s[12:13], s[4:5], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v10, v6 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3 +; GCN-IR-NEXT: s_cselect_b64 exec, s[4:5], s[10:11] +; GCN-IR-NEXT: s_cbranch_scc1 .LBB1_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB1_5: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: .LBB1_5: ; %Flow4 ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 ; GCN-IR-NEXT: v_or_b32_e32 v4, v7, v1 ; GCN-IR-NEXT: v_or_b32_e32 v5, v6, v0 -; GCN-IR-NEXT: .LBB1_6: ; %Flow5 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: .LBB1_6: ; %udiv-end ; GCN-IR-NEXT: v_mov_b32_e32 v0, v5 ; GCN-IR-NEXT: v_mov_b32_e32 v1, v4 ; GCN-IR-NEXT: s_setpc_b64 s[30:31] @@ -1205,26 +1210,30 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5] ; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v3, 0x8000 +; GCN-IR-NEXT: v_mov_b32_e32 v2, 0x8000 ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v2, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 ; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB9_6 +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-IR-NEXT: s_mov_b64 s[8:9], exec +; GCN-IR-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: s_cmov_b64 exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_scc0 .LBB9_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v4 -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v4 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc -; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v4 +; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[4:5], v2 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 +; GCN-IR-NEXT: s_xor_b64 s[6:7], vcc, exec ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GCN-IR-NEXT: s_cbranch_execz .LBB9_5 +; GCN-IR-NEXT: s_and_b64 s[10:11], vcc, -1 +; GCN-IR-NEXT: s_cmov_b64 exec, vcc +; GCN-IR-NEXT: s_cbranch_scc0 .LBB9_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc @@ -1240,36 +1249,37 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 ; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4 -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v8 +; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v9, vcc +; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6 ; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6 +; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 ; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10 ; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1 ; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0 -; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] ; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10 ; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: s_andn2_b64 s[4:5], exec, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 +; GCN-IR-NEXT: s_and_b64 s[12:13], s[4:5], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: s_cbranch_execnz .LBB9_3 +; GCN-IR-NEXT: s_cselect_b64 exec, s[4:5], s[10:11] +; GCN-IR-NEXT: s_cbranch_scc1 .LBB9_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: .LBB9_5: ; %Flow4 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1 -; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v1 -; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v0 -; GCN-IR-NEXT: .LBB9_6: ; %Flow5 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-IR-NEXT: v_mov_b32_e32 v0, v3 -; GCN-IR-NEXT: v_mov_b32_e32 v1, v2 +; GCN-IR-NEXT: v_or_b32_e32 v3, v5, v1 +; GCN-IR-NEXT: v_or_b32_e32 v2, v4, v0 +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: .LBB9_6: ; %udiv-end +; GCN-IR-NEXT: v_mov_b32_e32 v0, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v1, v3 ; GCN-IR-NEXT: s_setpc_b64 s[30:31] %result = udiv i64 32768, %x ret i64 %result @@ -1294,25 +1304,29 @@ define i64 @v_test_udiv_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_subb_u32_e64 v5, s[4:5], 0, 0, s[4:5] ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[4:5] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[4:5] ; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5] -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GCN-IR-NEXT: s_xor_b64 s[10:11], s[4:5], -1 +; GCN-IR-NEXT: s_and_b64 s[6:7], s[10:11], s[6:7] +; GCN-IR-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN-IR-NEXT: s_mov_b64 s[8:9], exec ; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v1, 0, s[4:5] +; GCN-IR-NEXT: s_and_b64 s[10:11], s[6:7], -1 ; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5] -; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB10_6 +; GCN-IR-NEXT: s_cmov_b64 exec, s[6:7] +; GCN-IR-NEXT: s_cbranch_scc0 .LBB10_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v4 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v4 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v4 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[0:1], v2 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 +; GCN-IR-NEXT: s_xor_b64 s[6:7], vcc, exec ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB10_5 +; GCN-IR-NEXT: s_and_b64 s[4:5], vcc, -1 +; GCN-IR-NEXT: s_cmov_b64 exec, vcc +; GCN-IR-NEXT: s_cbranch_scc0 .LBB10_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_lshr_b64 v[6:7], v[0:1], v6 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 0xffffffcf, v8 @@ -1337,23 +1351,24 @@ define i64 @v_test_udiv_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v8 ; GCN-IR-NEXT: v_and_b32_e32 v8, 0x8000, v8 ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3 ; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], v6, v8 -; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 ; GCN-IR-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v7, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3 +; GCN-IR-NEXT: s_andn2_b64 s[4:5], exec, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 +; GCN-IR-NEXT: s_and_b64 s[14:15], s[4:5], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v8, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: s_cbranch_execnz .LBB10_3 +; GCN-IR-NEXT: s_cselect_b64 exec, s[4:5], s[10:11] +; GCN-IR-NEXT: s_cbranch_scc1 .LBB10_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: .LBB10_5: ; %Flow4 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v1 ; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v0 -; GCN-IR-NEXT: .LBB10_6: ; %Flow5 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: .LBB10_6: ; %udiv-end ; GCN-IR-NEXT: v_mov_b32_e32 v0, v3 ; GCN-IR-NEXT: v_mov_b32_e32 v1, v2 ; GCN-IR-NEXT: s_setpc_b64 s[30:31] @@ -1592,25 +1607,29 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_subb_u32_e64 v5, s[4:5], 0, 0, s[4:5] ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[4:5] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[4:5] ; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5] -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GCN-IR-NEXT: s_xor_b64 s[10:11], s[4:5], -1 +; GCN-IR-NEXT: s_and_b64 s[6:7], s[10:11], s[6:7] +; GCN-IR-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN-IR-NEXT: s_mov_b64 s[8:9], exec ; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v1, 0, s[4:5] +; GCN-IR-NEXT: s_and_b64 s[10:11], s[6:7], -1 ; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5] -; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB12_6 +; GCN-IR-NEXT: s_cmov_b64 exec, s[6:7] +; GCN-IR-NEXT: s_cbranch_scc0 .LBB12_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v4 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v4 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v4 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[0:1], v2 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 +; GCN-IR-NEXT: s_xor_b64 s[6:7], vcc, exec ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB12_5 +; GCN-IR-NEXT: s_and_b64 s[4:5], vcc, -1 +; GCN-IR-NEXT: s_cmov_b64 exec, vcc +; GCN-IR-NEXT: s_cbranch_scc0 .LBB12_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_lshr_b64 v[6:7], v[0:1], v6 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 0xffffffc4, v8 @@ -1634,23 +1653,24 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v8 ; GCN-IR-NEXT: v_and_b32_e32 v8, 24, v8 ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3 ; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], v6, v8 -; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 ; GCN-IR-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v7, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3 +; GCN-IR-NEXT: s_andn2_b64 s[4:5], exec, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 +; GCN-IR-NEXT: s_and_b64 s[12:13], s[4:5], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v8, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: s_cbranch_execnz .LBB12_3 +; GCN-IR-NEXT: s_cselect_b64 exec, s[4:5], s[10:11] +; GCN-IR-NEXT: s_cbranch_scc1 .LBB12_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: .LBB12_5: ; %Flow4 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v1 ; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v0 -; GCN-IR-NEXT: .LBB12_6: ; %Flow5 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: .LBB12_6: ; %udiv-end ; GCN-IR-NEXT: v_mov_b32_e32 v0, v3 ; GCN-IR-NEXT: v_mov_b32_e32 v1, v2 ; GCN-IR-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll index f60a274f1e592..1805a33939a37 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll @@ -600,8 +600,9 @@ define amdgpu_kernel void @uniform_inside_divergent(ptr addrspace(1) %out, i32 % ; SI-LABEL: uniform_inside_divergent: ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 -; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc -; SI-NEXT: s_cbranch_execz .LBB11_2 +; SI-NEXT: s_and_b64 s[2:3], vcc, -1 +; SI-NEXT: s_cmov_b64 exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB11_2 ; SI-NEXT: ; %bb.1: ; %if ; SI-NEXT: s_load_dword s4, s[0:1], 0xb ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 @@ -623,8 +624,9 @@ define amdgpu_kernel void @uniform_inside_divergent(ptr addrspace(1) %out, i32 % ; VI-LABEL: uniform_inside_divergent: ; VI: ; %bb.0: ; %entry ; VI-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 -; VI-NEXT: s_and_saveexec_b64 s[2:3], vcc -; VI-NEXT: s_cbranch_execz .LBB11_2 +; VI-NEXT: s_and_b64 s[2:3], vcc, -1 +; VI-NEXT: s_cmov_b64 exec, vcc +; VI-NEXT: s_cbranch_scc0 .LBB11_2 ; VI-NEXT: ; %bb.1: ; %if ; VI-NEXT: s_load_dword s4, s[0:1], 0x2c ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 @@ -670,14 +672,15 @@ define amdgpu_kernel void @divergent_inside_uniform(ptr addrspace(1) %out, i32 % ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB12_2: ; %if ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_cbranch_execz .LBB12_1 +; SI-NEXT: s_cmov_b64 exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB12_1 ; SI-NEXT: ; %bb.3: ; %if_uniform ; SI-NEXT: v_mov_b32_e32 v0, 1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -693,14 +696,15 @@ define amdgpu_kernel void @divergent_inside_uniform(ptr addrspace(1) %out, i32 % ; VI-NEXT: s_endpgm ; VI-NEXT: .LBB12_2: ; %if ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0 -; VI-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_cbranch_execz .LBB12_1 +; VI-NEXT: s_cmov_b64 exec, vcc +; VI-NEXT: s_cbranch_scc0 .LBB12_1 ; VI-NEXT: ; %bb.3: ; %if_uniform ; VI-NEXT: v_mov_b32_e32 v0, 1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -728,16 +732,18 @@ define amdgpu_kernel void @divergent_if_uniform_if(ptr addrspace(1) %out, i32 %c ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc -; SI-NEXT: s_cbranch_execz .LBB13_2 +; SI-NEXT: s_mov_b64 s[2:3], exec +; SI-NEXT: s_and_b64 s[6:7], vcc, -1 +; SI-NEXT: s_cmov_b64 exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB13_2 ; SI-NEXT: ; %bb.1: ; %if ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; SI-NEXT: .LBB13_2: ; %endif ; SI-NEXT: s_or_b64 exec, exec, s[2:3] +; SI-NEXT: .LBB13_2: ; %endif ; SI-NEXT: s_load_dword s0, s[0:1], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s0, 0 @@ -756,16 +762,18 @@ define amdgpu_kernel void @divergent_if_uniform_if(ptr addrspace(1) %out, i32 %c ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: s_and_saveexec_b64 s[2:3], vcc -; VI-NEXT: s_cbranch_execz .LBB13_2 +; VI-NEXT: s_mov_b64 s[2:3], exec +; VI-NEXT: s_and_b64 s[6:7], vcc, -1 +; VI-NEXT: s_cmov_b64 exec, vcc +; VI-NEXT: s_cbranch_scc0 .LBB13_2 ; VI-NEXT: ; %bb.1: ; %if ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v0, 1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; VI-NEXT: .LBB13_2: ; %endif ; VI-NEXT: s_or_b64 exec, exec, s[2:3] +; VI-NEXT: .LBB13_2: ; %endif ; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll b/llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll index 3597d9a7010d3..88990036de9fe 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; RUN: llc -mtriple=amdgcn -mcpu=verde < %s | FileCheck %s ; Test a simple uniform loop that lives inside non-uniform control flow. diff --git a/llvm/test/CodeGen/AMDGPU/uniform-phi-with-undef.ll b/llvm/test/CodeGen/AMDGPU/uniform-phi-with-undef.ll index 5386ef425dcb5..0b0bf59985d59 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-phi-with-undef.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-phi-with-undef.ll @@ -13,10 +13,11 @@ define amdgpu_ps float @uniform_phi_with_undef(float inreg %c, float %v, i32 %x, ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: v_cmp_lt_i32_e64 s2, v2, v1 ; GCN-NEXT: s_mov_b32 s1, exec_lo -; GCN-NEXT: s_and_b32 s2, s1, s2 -; GCN-NEXT: s_mov_b32 exec_lo, s2 -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: s_and_b32 s3, s2, -1 +; GCN-NEXT: s_cmov_b32 exec_lo, s2 +; GCN-NEXT: s_cbranch_scc1 .LBB0_1 +; GCN-NEXT: s_branch .LBB0_2 +; GCN-NEXT: .LBB0_1: ; %if ; GCN-NEXT: s_mov_b32 s2, 2.0 ; GCN-NEXT: v_div_scale_f32 v1, s3, s2, s2, v0 ; GCN-NEXT: v_rcp_f32_e64 v2, v1 @@ -30,8 +31,8 @@ define amdgpu_ps float @uniform_phi_with_undef(float inreg %c, float %v, i32 %x, ; GCN-NEXT: v_fma_f32 v1, -v1, v4, v3 ; GCN-NEXT: v_div_fmas_f32 v1, v1, v2, v4 ; GCN-NEXT: v_div_fixup_f32 v0, v1, s2, v0 -; GCN-NEXT: .LBB0_2: ; %end ; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GCN-NEXT: .LBB0_2: ; %end ; GCN-NEXT: v_add_f32_e64 v0, v0, s0 ; GCN-NEXT: ; return to shader part epilog entry: diff --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll index a5e1506114f2d..00a32973a03c2 100644 --- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll +++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll @@ -1,5 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ; RUN: llc -mtriple=amdgcn-amdhsa -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=GCN %s + +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals ; RUN: opt -S -si-annotate-control-flow -mtriple=amdgcn-amdhsa -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=SI-OPT %s define hidden void @widget() { @@ -75,16 +77,18 @@ define hidden void @widget() { ; GCN-NEXT: s_and_b64 s[20:21], vcc, exec ; GCN-NEXT: s_or_b64 s[46:47], s[18:19], s[20:21] ; GCN-NEXT: .LBB0_4: ; %Flow2 -; GCN-NEXT: s_and_saveexec_b64 s[18:19], s[46:47] -; GCN-NEXT: s_xor_b64 s[18:19], exec, s[18:19] -; GCN-NEXT: s_cbranch_execz .LBB0_6 +; GCN-NEXT: s_and_b64 s[20:21], s[46:47], exec +; GCN-NEXT: s_and_b64 s[18:19], s[20:21], -1 +; GCN-NEXT: s_mov_b64 s[18:19], exec +; GCN-NEXT: s_cmov_b64 exec, s[20:21] +; GCN-NEXT: s_cbranch_scc0 .LBB0_6 ; GCN-NEXT: ; %bb.5: ; %bb12 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: flat_store_dword v[0:1], v2 -; GCN-NEXT: .LBB0_6: ; %Flow3 ; GCN-NEXT: s_or_b64 exec, exec, s[18:19] +; GCN-NEXT: .LBB0_6: ; %Flow3 ; GCN-NEXT: s_andn2_b64 vcc, exec, s[16:17] ; GCN-NEXT: s_cbranch_vccnz .LBB0_8 ; GCN-NEXT: ; %bb.7: ; %bb7 @@ -129,7 +133,7 @@ define hidden void @widget() { ; GCN-NEXT: s_andn2_b64 vcc, exec, s[18:19] ; GCN-NEXT: s_cbranch_vccz .LBB0_3 ; GCN-NEXT: s_branch .LBB0_4 -; SI-OPT-LABEL: @widget( +; SI-OPT-LABEL: define {{[^@]+}}@widget() { ; SI-OPT-NEXT: bb: ; SI-OPT-NEXT: [[TMP:%.*]] = load i32, ptr addrspace(1) null, align 16 ; SI-OPT-NEXT: [[TMP1:%.*]] = icmp slt i32 [[TMP]], 21 @@ -151,13 +155,11 @@ define hidden void @widget() { ; SI-OPT-NEXT: [[TMP0:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP11]]) ; SI-OPT-NEXT: [[TMP1:%.*]] = extractvalue { i1, i64 } [[TMP0]], 0 ; SI-OPT-NEXT: [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP0]], 1 -; SI-OPT-NEXT: br i1 [[TMP1]], label [[BB6:%.*]], label [[BB9_BB12_CRIT_EDGE:%.*]] -; SI-OPT: bb9.bb12_crit_edge: -; SI-OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP2]]) -; SI-OPT-NEXT: br label [[BB12]] +; SI-OPT-NEXT: br i1 [[TMP1]], label [[BB6:%.*]], label [[BB12]] ; SI-OPT: bb12: ; SI-OPT-NEXT: store float 0.000000e+00, ptr addrspace(1) null, align 8 ; SI-OPT-NEXT: ret void +; bb: %tmp = load i32, ptr addrspace(1) null, align 16 %tmp1 = icmp slt i32 %tmp, 21 @@ -192,7 +194,7 @@ declare hidden float @wibble() local_unnamed_addr define hidden void @blam() { -; SI-OPT-LABEL: @blam( +; SI-OPT-LABEL: define {{[^@]+}}@blam() { ; SI-OPT-NEXT: bb: ; SI-OPT-NEXT: [[TMP:%.*]] = load float, ptr null, align 16 ; SI-OPT-NEXT: br label [[BB2:%.*]] @@ -211,7 +213,6 @@ define hidden void @blam() { ; SI-OPT-NEXT: [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP0]], 1 ; SI-OPT-NEXT: br i1 [[TMP1]], label [[BB8:%.*]], label [[BB6:%.*]] ; SI-OPT: bb6: -; SI-OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP2]]) ; SI-OPT-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP3]], 3 ; SI-OPT-NEXT: br i1 [[TMP7]], label [[BB11:%.*]], label [[BB1:%.*]] ; SI-OPT: bb8: @@ -219,10 +220,7 @@ define hidden void @blam() { ; SI-OPT-NEXT: [[TMP3:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP9]]) ; SI-OPT-NEXT: [[TMP4:%.*]] = extractvalue { i1, i64 } [[TMP3]], 0 ; SI-OPT-NEXT: [[TMP5:%.*]] = extractvalue { i1, i64 } [[TMP3]], 1 -; SI-OPT-NEXT: br i1 [[TMP4]], label [[BB10:%.*]], label [[BB8_BB1_CRIT_EDGE:%.*]] -; SI-OPT: bb8.bb1_crit_edge: -; SI-OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP5]]) -; SI-OPT-NEXT: br label [[BB1]] +; SI-OPT-NEXT: br i1 [[TMP4]], label [[BB10:%.*]], label [[BB1]] ; SI-OPT: bb10: ; SI-OPT-NEXT: store float 0x7FF8000000000000, ptr addrspace(5) null, align 16 ; SI-OPT-NEXT: br label [[BB18:%.*]] @@ -234,14 +232,12 @@ define hidden void @blam() { ; SI-OPT-NEXT: [[TMP8:%.*]] = extractvalue { i1, i64 } [[TMP6]], 1 ; SI-OPT-NEXT: br i1 [[TMP7]], label [[BB2]], label [[BB14:%.*]] ; SI-OPT: bb14: -; SI-OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP8]]) ; SI-OPT-NEXT: [[TMP15:%.*]] = fcmp nsz oeq float [[TMP]], 0.000000e+00 ; SI-OPT-NEXT: [[TMP9:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP15]]) ; SI-OPT-NEXT: [[TMP10:%.*]] = extractvalue { i1, i64 } [[TMP9]], 0 ; SI-OPT-NEXT: [[TMP11:%.*]] = extractvalue { i1, i64 } [[TMP9]], 1 ; SI-OPT-NEXT: br i1 [[TMP10]], label [[BB17:%.*]], label [[BB16:%.*]] ; SI-OPT: bb16: -; SI-OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP11]]) ; SI-OPT-NEXT: store float 0x7FF8000000000000, ptr addrspace(5) null, align 16 ; SI-OPT-NEXT: br label [[BB17]] ; SI-OPT: bb17: @@ -315,27 +311,31 @@ define hidden void @blam() { ; GCN-NEXT: s_branch .LBB1_2 ; GCN-NEXT: .LBB1_1: ; %Flow7 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-NEXT: s_and_b64 s[4:5], exec, s[4:5] ; GCN-NEXT: s_or_b64 s[50:51], s[4:5], s[50:51] -; GCN-NEXT: s_andn2_b64 exec, exec, s[50:51] -; GCN-NEXT: s_cbranch_execz .LBB1_18 +; GCN-NEXT: s_andn2_b64 s[4:5], exec, s[50:51] +; GCN-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GCN-NEXT: s_cselect_b64 exec, s[4:5], s[50:51] +; GCN-NEXT: s_cbranch_scc0 .LBB1_18 ; GCN-NEXT: .LBB1_2: ; %bb2 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: flat_load_dword v0, v[41:42] ; GCN-NEXT: buffer_store_dword v42, off, s[0:3], 0 -; GCN-NEXT: s_mov_b64 s[6:7], 0 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_cmp_lt_i32_e32 vcc, 2, v0 +; GCN-NEXT: s_mov_b64 s[6:7], 0 +; GCN-NEXT: s_xor_b64 s[54:55], vcc, exec +; GCN-NEXT: s_and_b64 s[4:5], vcc, -1 ; GCN-NEXT: s_mov_b64 s[4:5], -1 -; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GCN-NEXT: s_xor_b64 s[54:55], exec, s[8:9] -; GCN-NEXT: s_cbranch_execz .LBB1_12 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB1_12 ; GCN-NEXT: ; %bb.3: ; %bb6 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 +; GCN-NEXT: s_mov_b64 s[56:57], exec ; GCN-NEXT: v_cmp_eq_u32_e64 s[44:45], 3, v0 -; GCN-NEXT: s_and_saveexec_b64 s[56:57], s[44:45] -; GCN-NEXT: s_cbranch_execz .LBB1_11 +; GCN-NEXT: s_and_b64 s[4:5], s[44:45], -1 +; GCN-NEXT: s_cmov_b64 exec, s[44:45] +; GCN-NEXT: s_cbranch_scc0 .LBB1_11 ; GCN-NEXT: ; %bb.4: ; %bb11 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GCN-NEXT: s_getpc_b64 s[16:17] @@ -352,81 +352,96 @@ define hidden void @blam() { ; GCN-NEXT: v_mov_b32_e32 v31, v40 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 +; GCN-NEXT: s_xor_b64 s[4:5], vcc, exec +; GCN-NEXT: s_and_b64 s[6:7], vcc, -1 ; GCN-NEXT: s_mov_b64 s[6:7], 0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz .LBB1_10 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB1_10 ; GCN-NEXT: ; %bb.5: ; %bb14 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 +; GCN-NEXT: s_mov_b64 s[6:7], exec +; GCN-NEXT: s_and_b64 s[10:11], s[42:43], exec +; GCN-NEXT: s_and_b64 s[8:9], s[10:11], -1 ; GCN-NEXT: s_mov_b64 s[8:9], s[52:53] -; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[42:43] -; GCN-NEXT: s_cbranch_execz .LBB1_7 +; GCN-NEXT: s_cmov_b64 exec, s[10:11] +; GCN-NEXT: s_cbranch_scc0 .LBB1_7 ; GCN-NEXT: ; %bb.6: ; %bb16 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0 ; GCN-NEXT: s_or_b64 s[8:9], s[52:53], exec +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-NEXT: .LBB1_7: ; %Flow3 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-NEXT: s_mov_b64 s[6:7], 0 -; GCN-NEXT: s_and_saveexec_b64 s[10:11], s[8:9] -; GCN-NEXT: s_xor_b64 s[8:9], exec, s[10:11] -; GCN-NEXT: s_cbranch_execz .LBB1_9 +; GCN-NEXT: s_mov_b64 s[6:7], exec +; GCN-NEXT: s_and_b64 s[10:11], s[8:9], exec +; GCN-NEXT: s_and_b64 s[8:9], s[10:11], -1 +; GCN-NEXT: s_mov_b64 s[8:9], 0 +; GCN-NEXT: s_cmov_b64 exec, s[10:11] +; GCN-NEXT: s_cbranch_scc0 .LBB1_9 ; GCN-NEXT: ; %bb.8: ; %bb17 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: s_mov_b64 s[6:7], exec +; GCN-NEXT: s_mov_b64 s[8:9], exec ; GCN-NEXT: buffer_store_dword v43, off, s[0:3], 0 +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-NEXT: .LBB1_9: ; %Flow4 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN-NEXT: s_and_b64 s[6:7], s[8:9], exec +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: .LBB1_10: ; %Flow2 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: s_andn2_b64 s[4:5], s[44:45], exec ; GCN-NEXT: s_and_b64 s[8:9], vcc, exec ; GCN-NEXT: s_or_b64 s[44:45], s[4:5], s[8:9] ; GCN-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN-NEXT: s_or_b64 exec, exec, s[56:57] ; GCN-NEXT: .LBB1_11: ; %Flow1 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: s_or_b64 exec, exec, s[56:57] ; GCN-NEXT: s_orn2_b64 s[4:5], s[44:45], exec ; GCN-NEXT: s_and_b64 s[6:7], s[6:7], exec ; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: s_or_b64 exec, exec, s[54:55] ; GCN-NEXT: .LBB1_12: ; %Flow ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: s_andn2_saveexec_b64 s[8:9], s[54:55] -; GCN-NEXT: s_cbranch_execz .LBB1_16 +; GCN-NEXT: s_xor_b64 s[8:9], s[54:55], exec +; GCN-NEXT: s_and_b64 s[10:11], s[54:55], -1 +; GCN-NEXT: s_cmov_b64 exec, s[54:55] +; GCN-NEXT: s_cbranch_scc0 .LBB1_16 ; GCN-NEXT: ; %bb.13: ; %bb8 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 +; GCN-NEXT: s_mov_b64 s[12:13], exec ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-NEXT: s_and_b64 s[10:11], vcc, -1 ; GCN-NEXT: s_mov_b64 s[10:11], s[6:7] -; GCN-NEXT: s_and_saveexec_b64 s[12:13], vcc -; GCN-NEXT: s_cbranch_execz .LBB1_15 +; GCN-NEXT: s_cmov_b64 exec, vcc +; GCN-NEXT: s_cbranch_scc0 .LBB1_15 ; GCN-NEXT: ; %bb.14: ; %bb10 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0 ; GCN-NEXT: s_or_b64 s[10:11], s[6:7], exec +; GCN-NEXT: s_or_b64 exec, exec, s[12:13] ; GCN-NEXT: .LBB1_15: ; %Flow6 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: s_or_b64 exec, exec, s[12:13] ; GCN-NEXT: s_andn2_b64 s[4:5], s[4:5], exec ; GCN-NEXT: s_and_b64 s[12:13], vcc, exec ; GCN-NEXT: s_andn2_b64 s[6:7], s[6:7], exec ; GCN-NEXT: s_and_b64 s[10:11], s[10:11], exec ; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[12:13] ; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] +; GCN-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-NEXT: .LBB1_16: ; %Flow5 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-NEXT: s_and_saveexec_b64 s[8:9], s[6:7] -; GCN-NEXT: s_cbranch_execz .LBB1_1 +; GCN-NEXT: s_and_b64 s[8:9], s[6:7], exec +; GCN-NEXT: s_and_b64 s[6:7], s[8:9], -1 +; GCN-NEXT: s_mov_b64 s[6:7], exec +; GCN-NEXT: s_cmov_b64 exec, s[8:9] +; GCN-NEXT: s_cbranch_scc0 .LBB1_1 ; GCN-NEXT: ; %bb.17: ; %bb18 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0 ; GCN-NEXT: s_andn2_b64 s[4:5], s[4:5], exec +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-NEXT: s_branch .LBB1_1 ; GCN-NEXT: .LBB1_18: ; %DummyReturnBlock -; GCN-NEXT: s_or_b64 exec, exec, s[50:51] ; GCN-NEXT: v_readlane_b32 s57, v45, 25 ; GCN-NEXT: v_readlane_b32 s56, v45, 24 ; GCN-NEXT: v_readlane_b32 s55, v45, 23 @@ -524,3 +539,11 @@ declare hidden float @spam() !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 500} +;. +; SI-OPT: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; SI-OPT: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind willreturn } +; SI-OPT: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nounwind willreturn memory(none) } +; SI-OPT: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nounwind willreturn } +;. +; SI-OPT: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} +;. diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll index f35589853393c..59f5eda491f66 100644 --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -319,39 +319,43 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) { ; GCN-IR-LABEL: v_test_urem_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v2 -; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4 +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[4:5] +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v3 ; GCN-IR-NEXT: v_min_u32_e32 v12, v4, v5 ; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v0 -; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1 ; GCN-IR-NEXT: v_min_u32_e32 v13, v4, v5 -; GCN-IR-NEXT: v_sub_i32_e64 v4, s[6:7], v12, v13 -; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; GCN-IR-NEXT: v_subb_u32_e64 v5, s[6:7], 0, 0, s[6:7] -; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[6:7], 63, v[4:5] -; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5] -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; GCN-IR-NEXT: v_cndmask_b32_e64 v7, v1, 0, s[4:5] -; GCN-IR-NEXT: v_cndmask_b32_e64 v6, v0, 0, s[4:5] -; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB1_6 +; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v13 +; GCN-IR-NEXT: v_subb_u32_e64 v5, s[4:5], 0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[4:5], 63, v[4:5] +; GCN-IR-NEXT: s_or_b64 s[8:9], s[8:9], vcc +; GCN-IR-NEXT: s_xor_b64 s[10:11], s[8:9], -1 +; GCN-IR-NEXT: s_and_b64 s[4:5], s[10:11], s[4:5] +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-IR-NEXT: s_mov_b64 s[6:7], exec +; GCN-IR-NEXT: v_cndmask_b32_e64 v7, v1, 0, s[8:9] +; GCN-IR-NEXT: s_and_b64 s[10:11], s[4:5], -1 +; GCN-IR-NEXT: v_cndmask_b32_e64 v6, v0, 0, s[8:9] +; GCN-IR-NEXT: s_cmov_b64 exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_scc0 .LBB1_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 ; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v4 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v4 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4 ; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 +; GCN-IR-NEXT: s_xor_b64 s[8:9], vcc, exec ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB1_5 +; GCN-IR-NEXT: s_and_b64 s[4:5], vcc, -1 +; GCN-IR-NEXT: s_cmov_b64 exec, vcc +; GCN-IR-NEXT: s_cbranch_scc0 .LBB1_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v2 ; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v3, vcc @@ -369,34 +373,35 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5 ; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v6 -; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 ; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v14, v10 +; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 ; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v15, v11, vcc +; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v8 ; GCN-IR-NEXT: v_or_b32_e32 v4, v12, v4 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v8 +; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; GCN-IR-NEXT: v_or_b32_e32 v5, v13, v5 ; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12 ; GCN-IR-NEXT: v_and_b32_e32 v13, v12, v3 ; GCN-IR-NEXT: v_and_b32_e32 v12, v12, v2 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] ; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v12 ; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v13, s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v13, v7 ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: s_andn2_b64 s[4:5], exec, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v13, v7 +; GCN-IR-NEXT: s_and_b64 s[12:13], s[4:5], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v12, v6 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3 +; GCN-IR-NEXT: s_cselect_b64 exec, s[4:5], s[10:11] +; GCN-IR-NEXT: s_cbranch_scc1 .LBB1_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: .LBB1_5: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: .LBB1_5: ; %Flow4 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 ; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v5 ; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v4 -; GCN-IR-NEXT: .LBB1_6: ; %Flow5 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: .LBB1_6: ; %udiv-end ; GCN-IR-NEXT: v_mul_lo_u32 v4, v2, v7 ; GCN-IR-NEXT: v_mul_hi_u32 v5, v2, v6 ; GCN-IR-NEXT: v_mul_lo_u32 v3, v3, v6 @@ -1227,22 +1232,26 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB8_6 +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-IR-NEXT: s_mov_b64 s[8:9], exec +; GCN-IR-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: s_cmov_b64 exec, s[4:5] +; GCN-IR-NEXT: s_cbranch_scc0 .LBB8_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v2 -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc -; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 +; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[4:5], v2 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 +; GCN-IR-NEXT: s_xor_b64 s[6:7], vcc, exec ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GCN-IR-NEXT: s_cbranch_execz .LBB8_5 +; GCN-IR-NEXT: s_and_b64 s[10:11], vcc, -1 +; GCN-IR-NEXT: s_cmov_b64 exec, vcc +; GCN-IR-NEXT: s_cbranch_scc0 .LBB8_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc @@ -1258,34 +1267,35 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 ; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4 -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v8 +; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v9, vcc +; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6 ; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6 +; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 ; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10 ; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v1 ; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v0 -; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] ; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10 ; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v11, s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: s_andn2_b64 s[4:5], exec, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 +; GCN-IR-NEXT: s_and_b64 s[12:13], s[4:5], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: s_cbranch_execnz .LBB8_3 +; GCN-IR-NEXT: s_cselect_b64 exec, s[4:5], s[10:11] +; GCN-IR-NEXT: s_cbranch_scc1 .LBB8_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: .LBB8_5: ; %Flow4 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3 ; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2 -; GCN-IR-NEXT: .LBB8_6: ; %Flow5 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: .LBB8_6: ; %udiv-end ; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v5 ; GCN-IR-NEXT: v_mul_hi_u32 v3, v0, v4 ; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v4 @@ -1318,25 +1328,29 @@ define i64 @v_test_urem_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, s[4:5] ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[2:3] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[2:3] ; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[2:3] -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GCN-IR-NEXT: s_xor_b64 s[10:11], s[4:5], -1 +; GCN-IR-NEXT: s_and_b64 s[6:7], s[10:11], s[6:7] +; GCN-IR-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN-IR-NEXT: s_mov_b64 s[8:9], exec ; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v1, 0, s[4:5] +; GCN-IR-NEXT: s_and_b64 s[10:11], s[6:7], -1 ; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v0, 0, s[4:5] -; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB9_6 +; GCN-IR-NEXT: s_cmov_b64 exec, s[6:7] +; GCN-IR-NEXT: s_cbranch_scc0 .LBB9_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v2 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[0:1], v2 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 +; GCN-IR-NEXT: s_xor_b64 s[6:7], vcc, exec ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] -; GCN-IR-NEXT: s_cbranch_execz .LBB9_5 +; GCN-IR-NEXT: s_and_b64 s[4:5], vcc, -1 +; GCN-IR-NEXT: s_cmov_b64 exec, vcc +; GCN-IR-NEXT: s_cbranch_scc0 .LBB9_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[0:1], v6 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 0xffffffcf, v10 @@ -1361,23 +1375,24 @@ define i64 @v_test_urem_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10 ; GCN-IR-NEXT: v_and_b32_e32 v10, 0x8000, v10 ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] -; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 ; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10 -; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 ; GCN-IR-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v9, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 +; GCN-IR-NEXT: s_andn2_b64 s[4:5], exec, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 +; GCN-IR-NEXT: s_and_b64 s[14:15], s[4:5], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 -; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] -; GCN-IR-NEXT: s_cbranch_execnz .LBB9_3 +; GCN-IR-NEXT: s_cselect_b64 exec, s[4:5], s[10:11] +; GCN-IR-NEXT: s_cbranch_scc1 .LBB9_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow -; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: .LBB9_5: ; %Flow4 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3 ; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2 -; GCN-IR-NEXT: .LBB9_6: ; %Flow5 -; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-IR-NEXT: .LBB9_6: ; %udiv-end ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[4:5], 15 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc diff --git a/llvm/test/CodeGen/AMDGPU/valu-i1.ll b/llvm/test/CodeGen/AMDGPU/valu-i1.ll index 9a64a6d99f46f..202f5dfe4ffa0 100644 --- a/llvm/test/CodeGen/AMDGPU/valu-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/valu-i1.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; RUN: llc -mtriple=amdgcn -verify-machineinstrs -enable-misched -asm-verbose -disable-block-placement -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=SI %s declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone diff --git a/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll index 2c66d38a1be62..c7a54557da680 100644 --- a/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll +++ b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1010 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1030 %s diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll index 0211c5111c31d..8ffb56d56ca7b 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll @@ -14,7 +14,6 @@ define void @vgpr_descriptor_waterfall_loop_idom_update(ptr %arg) #0 { ; GCN-NEXT: s_clause 0x1 ; GCN-NEXT: flat_load_dwordx2 v[4:5], v[6:7] ; GCN-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GCN-NEXT: s_mov_b32 s5, exec_lo ; GCN-NEXT: .LBB0_2: ; Parent Loop BB0_1 Depth=1 ; GCN-NEXT: ; => This Inner Loop Header: Depth=2 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -26,13 +25,14 @@ define void @vgpr_descriptor_waterfall_loop_idom_update(ptr %arg) #0 { ; GCN-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[4:5] ; GCN-NEXT: s_and_b32 s4, vcc_lo, s4 ; GCN-NEXT: s_and_saveexec_b32 s4, s4 +; GCN-NEXT: s_xor_b32 s5, exec_lo, s4 ; GCN-NEXT: buffer_store_dword v0, v0, s[8:11], 0 offen +; GCN-NEXT: s_and_b32 s6, s5, -1 ; GCN-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GCN-NEXT: s_waitcnt_depctr 0xffe3 -; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GCN-NEXT: s_cbranch_execnz .LBB0_2 +; GCN-NEXT: s_cselect_b32 exec_lo, s5, s4 +; GCN-NEXT: s_cbranch_scc1 .LBB0_2 ; GCN-NEXT: ; %bb.3: ; in Loop: Header=BB0_1 Depth=1 -; GCN-NEXT: s_mov_b32 exec_lo, s5 ; GCN-NEXT: s_mov_b32 vcc_lo, exec_lo ; GCN-NEXT: s_cbranch_vccnz .LBB0_1 ; GCN-NEXT: ; %bb.4: ; %DummyReturnBlock @@ -46,7 +46,6 @@ define void @vgpr_descriptor_waterfall_loop_idom_update(ptr %arg) #0 { ; GFX11-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-NEXT: ; Child Loop BB0_2 Depth 2 ; GFX11-NEXT: flat_load_b128 v[2:5], v[0:1] -; GFX11-NEXT: s_mov_b32 s1, exec_lo ; GFX11-NEXT: .LBB0_2: ; Parent Loop BB0_1 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -60,13 +59,14 @@ define void @vgpr_descriptor_waterfall_loop_idom_update(ptr %arg) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s1, exec_lo, s0 ; GFX11-NEXT: buffer_store_b32 v0, v0, s[4:7], 0 offen +; GFX11-NEXT: s_and_b32 s2, s1, -1 ; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB0_2 +; GFX11-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX11-NEXT: ; %bb.3: ; in Loop: Header=BB0_1 Depth=1 -; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_mov_b32 vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_vccnz .LBB0_1 ; GFX11-NEXT: ; %bb.4: ; %DummyReturnBlock diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll index bfc249e9081d2..cb0022caccab3 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll @@ -26,17 +26,18 @@ define amdgpu_ps float @else1(i32 %z, float %v) #0 { ; SI-NEXT: successors: %bb.4(0x80000000) ; SI-NEXT: {{ $}} ; SI-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[PHI1]], 0, [[PHI1]], 0, 0, implicit $mode, implicit $exec + ; SI-NEXT: SI_WAVE_RECONVERGE killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.4 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.3.else: ; SI-NEXT: successors: %bb.1(0x80000000) ; SI-NEXT: {{ $}} ; SI-NEXT: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, 1077936128, 0, killed [[COPY]], 0, 0, implicit $mode, implicit $exec + ; SI-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.1 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.4.end: ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[PHI]], %bb.1, [[V_ADD_F32_e64_]], %bb.2 - ; SI-NEXT: SI_END_CF killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: $vgpr0 = COPY killed [[PHI2]] ; SI-NEXT: SI_RETURN_TO_EPILOG killed $vgpr0 main_body: @@ -82,18 +83,19 @@ define amdgpu_ps float @else2(i32 %z, float %v) #0 { ; SI-NEXT: successors: %bb.4(0x80000000) ; SI-NEXT: {{ $}} ; SI-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; SI-NEXT: SI_WAVE_RECONVERGE killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.4 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.3.else: ; SI-NEXT: successors: %bb.1(0x80000000) ; SI-NEXT: {{ $}} ; SI-NEXT: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, 1077936128, 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; SI-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.1 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.4.end: ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1, [[V_ADD_F32_e64_]], %bb.2 ; SI-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[PHI]], %bb.1, [[V_ADD_F32_e64_]], %bb.2 - ; SI-NEXT: SI_END_CF killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[PHI2]], 0, killed [[PHI3]], 0, 0, implicit $mode, implicit $exec ; SI-NEXT: $vgpr0 = COPY killed [[V_ADD_F32_e64_1]] ; SI-NEXT: SI_RETURN_TO_EPILOG killed $vgpr0 @@ -152,6 +154,7 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 { ; SI-NEXT: {{ $}} ; SI-NEXT: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[PHI]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec ; SI-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 1, killed [[PHI4]], 0, implicit $exec + ; SI-NEXT: SI_WAVE_RECONVERGE killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.5 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.4.else: @@ -159,6 +162,7 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 { ; SI-NEXT: {{ $}} ; SI-NEXT: [[V_MUL_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[COPY2]], 0, [[PHI1]], 0, 0, implicit $mode, implicit $exec ; SI-NEXT: [[V_LSHL_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_ADD_U32_e64 killed [[PHI1]], 1, [[PHI1]], implicit $exec + ; SI-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.2 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.5.if.end: @@ -166,7 +170,6 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 { ; SI-NEXT: {{ $}} ; SI-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[PHI3]], %bb.2, [[V_MUL_F32_e64_]], %bb.3 ; SI-NEXT: [[PHI6:%[0-9]+]]:vgpr_32 = PHI [[PHI2]], %bb.2, [[V_ADD_U32_e64_]], %bb.3 - ; SI-NEXT: SI_END_CF killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 1, [[PHI6]], 0, implicit $exec ; SI-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 killed [[PHI]], 1, implicit-def dead $scc ; SI-NEXT: S_CMP_LT_I32 [[S_ADD_I32_]], [[COPY1]], implicit-def $scc @@ -233,10 +236,10 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun ; SI-NEXT: bb.1.Flow: ; SI-NEXT: successors: %bb.2(0x40000000), %bb.10(0x40000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %47:vgpr_32, %bb.0, %4, %bb.9 - ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY4]], %bb.0, undef %49:vgpr_32, %bb.9 - ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %51:vgpr_32, %bb.9 - ; SI-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %53:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %49:vgpr_32, %bb.0, %4, %bb.9 + ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY4]], %bb.0, undef %51:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %53:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %55:vgpr_32, %bb.9 ; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.2 ; SI-NEXT: {{ $}} @@ -244,13 +247,12 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun ; SI-NEXT: successors: %bb.3(0x80000000) ; SI-NEXT: {{ $}} ; SI-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[PHI2]], %subreg.sub0, killed [[PHI3]], %subreg.sub1 - ; SI-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; SI-NEXT: {{ $}} ; SI-NEXT: bb.3: ; SI-NEXT: successors: %bb.4(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %55:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2 - ; SI-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI undef %57:vgpr_32, %bb.4, [[PHI1]], %bb.2 + ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %57:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2 + ; SI-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI undef %59:vgpr_32, %bb.4, [[PHI1]], %bb.2 ; SI-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub0, implicit $exec ; SI-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -267,27 +269,26 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun ; SI-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE1]], 0, csr_amdgpu_si_gfx, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0, implicit-def $vgpr0 ; SI-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; SI-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 - ; SI-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_]], implicit-def dead $scc - ; SI-NEXT: SI_WATERFALL_LOOP %bb.3, implicit $exec + ; SI-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def dead $scc + ; SI-NEXT: SI_WATERFALL_LOOP killed [[S_XOR_B32_term]], killed [[S_AND_SAVEEXEC_B32_]], %bb.3, implicit $exec ; SI-NEXT: {{ $}} ; SI-NEXT: bb.5: ; SI-NEXT: successors: %bb.10(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: $exec_lo = S_MOV_B32 killed [[S_MOV_B32_]] ; SI-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY7]] + ; SI-NEXT: SI_WAVE_RECONVERGE killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.10 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.6.else: ; SI-NEXT: successors: %bb.7(0x80000000) ; SI-NEXT: {{ $}} ; SI-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[COPY1]], %subreg.sub0, killed [[COPY]], %subreg.sub1 - ; SI-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; SI-NEXT: {{ $}} ; SI-NEXT: bb.7: ; SI-NEXT: successors: %bb.8(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI6:%[0-9]+]]:vreg_64 = PHI undef %59:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6 - ; SI-NEXT: [[PHI7:%[0-9]+]]:vgpr_32 = PHI undef %61:vgpr_32, %bb.8, [[COPY4]], %bb.6 + ; SI-NEXT: [[PHI6:%[0-9]+]]:vreg_64 = PHI undef %61:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6 + ; SI-NEXT: [[PHI7:%[0-9]+]]:vgpr_32 = PHI undef %63:vgpr_32, %bb.8, [[COPY4]], %bb.6 ; SI-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI6]].sub0, implicit $exec ; SI-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI6]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_2]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_3]], %subreg.sub1 @@ -304,19 +305,18 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun ; SI-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE3]], 0, csr_amdgpu_si_gfx, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0, implicit-def $vgpr0 ; SI-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; SI-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 - ; SI-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_1]], implicit-def dead $scc - ; SI-NEXT: SI_WATERFALL_LOOP %bb.7, implicit $exec + ; SI-NEXT: [[S_XOR_B32_term1:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_1]], implicit-def dead $scc + ; SI-NEXT: SI_WATERFALL_LOOP killed [[S_XOR_B32_term1]], killed [[S_AND_SAVEEXEC_B32_1]], %bb.7, implicit $exec ; SI-NEXT: {{ $}} ; SI-NEXT: bb.9: ; SI-NEXT: successors: %bb.1(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: $exec_lo = S_MOV_B32 killed [[S_MOV_B32_1]] ; SI-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY10]] + ; SI-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.1 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.10.end: ; SI-NEXT: [[PHI8:%[0-9]+]]:vgpr_32 = PHI [[PHI]], %bb.1, [[COPY8]], %bb.5 - ; SI-NEXT: SI_END_CF killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: $vgpr0 = COPY killed [[PHI8]] ; SI-NEXT: SI_RETURN_TO_EPILOG killed $vgpr0 main_body: @@ -356,9 +356,9 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e ; SI-NEXT: bb.1.Flow: ; SI-NEXT: successors: %bb.2(0x40000000), %bb.10(0x40000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %48:vgpr_32, %bb.0, %4, %bb.9 - ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %50:vgpr_32, %bb.9 - ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %52:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %50:vgpr_32, %bb.0, %4, %bb.9 + ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %52:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %54:vgpr_32, %bb.9 ; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.2 ; SI-NEXT: {{ $}} @@ -366,12 +366,11 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e ; SI-NEXT: successors: %bb.3(0x80000000) ; SI-NEXT: {{ $}} ; SI-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[PHI1]], %subreg.sub0, killed [[PHI2]], %subreg.sub1 - ; SI-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; SI-NEXT: {{ $}} ; SI-NEXT: bb.3: ; SI-NEXT: successors: %bb.4(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI3:%[0-9]+]]:vreg_64 = PHI undef %54:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2 + ; SI-NEXT: [[PHI3:%[0-9]+]]:vreg_64 = PHI undef %56:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2 ; SI-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI3]].sub0, implicit $exec ; SI-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI3]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -388,26 +387,25 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e ; SI-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE1]], 0, csr_amdgpu_si_gfx, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0, implicit-def $vgpr0 ; SI-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; SI-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 - ; SI-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_]], implicit-def dead $scc - ; SI-NEXT: SI_WATERFALL_LOOP %bb.3, implicit $exec + ; SI-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def dead $scc + ; SI-NEXT: SI_WATERFALL_LOOP killed [[S_XOR_B32_term]], killed [[S_AND_SAVEEXEC_B32_]], %bb.3, implicit $exec ; SI-NEXT: {{ $}} ; SI-NEXT: bb.5: ; SI-NEXT: successors: %bb.10(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: $exec_lo = S_MOV_B32 killed [[S_MOV_B32_]] ; SI-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY7]] + ; SI-NEXT: SI_WAVE_RECONVERGE killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.10 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.6.else: ; SI-NEXT: successors: %bb.7(0x80000000) ; SI-NEXT: {{ $}} ; SI-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[COPY1]], %subreg.sub0, killed [[COPY]], %subreg.sub1 - ; SI-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; SI-NEXT: {{ $}} ; SI-NEXT: bb.7: ; SI-NEXT: successors: %bb.8(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %56:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6 + ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %58:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6 ; SI-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub0, implicit $exec ; SI-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_2]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_3]], %subreg.sub1 @@ -424,19 +422,18 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e ; SI-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE3]], 0, csr_amdgpu_si_gfx, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0, implicit-def $vgpr0 ; SI-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; SI-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 - ; SI-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_1]], implicit-def dead $scc - ; SI-NEXT: SI_WATERFALL_LOOP %bb.7, implicit $exec + ; SI-NEXT: [[S_XOR_B32_term1:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_1]], implicit-def dead $scc + ; SI-NEXT: SI_WATERFALL_LOOP killed [[S_XOR_B32_term1]], killed [[S_AND_SAVEEXEC_B32_1]], %bb.7, implicit $exec ; SI-NEXT: {{ $}} ; SI-NEXT: bb.9: ; SI-NEXT: successors: %bb.1(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: $exec_lo = S_MOV_B32 killed [[S_MOV_B32_1]] ; SI-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY10]] + ; SI-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.1 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.10.end: ; SI-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[PHI]], %bb.1, [[COPY8]], %bb.5 - ; SI-NEXT: SI_END_CF killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[PHI5]], 0, killed [[COPY4]], 0, 0, implicit $mode, implicit $exec ; SI-NEXT: $vgpr0 = COPY killed [[V_ADD_F32_e64_]] ; SI-NEXT: SI_RETURN_TO_EPILOG killed $vgpr0 @@ -480,6 +477,7 @@ define amdgpu_kernel void @livevariables_update_missed_block(ptr addrspace(1) %s ; SI-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8) from %ir.i10, addrspace 1) ; SI-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec ; SI-NEXT: GLOBAL_STORE_BYTE killed [[V_MOV_B]], killed [[GLOBAL_LOAD_UBYTE]], 0, 0, implicit $exec :: (store (s8) into `ptr addrspace(1) null`, addrspace 1) + ; SI-NEXT: SI_WAVE_RECONVERGE killed %6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.7 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.2.if.then9: @@ -512,10 +510,10 @@ define amdgpu_kernel void @livevariables_update_missed_block(ptr addrspace(1) %s ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef %35:vgpr_32, %bb.3, [[GLOBAL_LOAD_UBYTE1]], %bb.4 ; SI-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec ; SI-NEXT: GLOBAL_STORE_BYTE killed [[V_MOV_B2]], killed [[PHI1]], 0, 0, implicit $exec :: (store (s8) into `ptr addrspace(1) null`, addrspace 1) + ; SI-NEXT: SI_WAVE_RECONVERGE [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.5 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.7.UnifiedReturnBlock: - ; SI-NEXT: SI_END_CF killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_ENDPGM 0 entry: %i2 = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -585,7 +583,6 @@ define protected amdgpu_kernel void @nested_waterfalls(ptr addrspace(1) %tex.coe ; SI-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX4_1]].sub0 ; SI-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_256 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3, killed [[COPY5]], %subreg.sub4, killed [[COPY4]], %subreg.sub5, killed [[COPY3]], %subreg.sub6, killed [[COPY2]], %subreg.sub7 ; SI-NEXT: [[GLOBAL_LOAD_DWORDX4_2:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 killed [[GLOBAL_LOAD_DWORDX2_SADDR]], 48, 0, implicit $exec :: (invariant load (s128) from %ir.add.ptr.i, addrspace 4) - ; SI-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; SI-NEXT: {{ $}} ; SI-NEXT: bb.2: ; SI-NEXT: successors: %bb.3(0x80000000) @@ -615,8 +612,6 @@ define protected amdgpu_kernel void @nested_waterfalls(ptr addrspace(1) %tex.coe ; SI-NEXT: bb.3: ; SI-NEXT: successors: %bb.4(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo - ; SI-NEXT: {{ $}} ; SI-NEXT: bb.4: ; SI-NEXT: successors: %bb.5(0x80000000) ; SI-NEXT: {{ $}} @@ -636,18 +631,16 @@ define protected amdgpu_kernel void @nested_waterfalls(ptr addrspace(1) %tex.coe ; SI-NEXT: successors: %bb.4(0x40000000), %bb.6(0x40000000) ; SI-NEXT: {{ $}} ; SI-NEXT: [[IMAGE_SAMPLE_V1_V2_gfx10_:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_V1_V2_gfx10 undef %22:vreg_64, [[REG_SEQUENCE5]], killed [[REG_SEQUENCE8]], 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) - ; SI-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_1]], implicit-def dead $scc - ; SI-NEXT: SI_WATERFALL_LOOP %bb.4, implicit $exec + ; SI-NEXT: [[S_XOR_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_1]], implicit-def dead $scc + ; SI-NEXT: SI_WATERFALL_LOOP killed [[S_XOR_B32_term]], killed [[S_AND_SAVEEXEC_B32_1]], %bb.4, implicit $exec ; SI-NEXT: {{ $}} ; SI-NEXT: bb.6: ; SI-NEXT: successors: %bb.2(0x40000000), %bb.7(0x40000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: $exec_lo = S_MOV_B32 killed [[S_MOV_B32_1]] - ; SI-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_]], implicit-def dead $scc - ; SI-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; SI-NEXT: [[S_XOR_B32_term1:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def dead $scc + ; SI-NEXT: SI_WATERFALL_LOOP killed [[S_XOR_B32_term1]], killed [[S_AND_SAVEEXEC_B32_]], %bb.2, implicit $exec ; SI-NEXT: {{ $}} ; SI-NEXT: bb.7: - ; SI-NEXT: $exec_lo = S_MOV_B32 killed [[S_MOV_B32_]] ; SI-NEXT: GLOBAL_STORE_DWORD undef %25:vreg_64, killed [[IMAGE_SAMPLE_V1_V2_gfx10_]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) ; SI-NEXT: S_ENDPGM 0 entry: diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll index 25d8300eb4583..143f6ea9075ec 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll @@ -7,25 +7,24 @@ define amdgpu_ps float @else1(i32 %z, float %v) #0 { ; SI: ; %bb.0: ; %main_body ; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v0 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: s_and_saveexec_b32 s0, vcc_lo -; SI-NEXT: s_xor_b32 s0, exec_lo, s0 -; SI-NEXT: s_cbranch_execnz .LBB0_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b32 s0, s0 -; SI-NEXT: s_cbranch_execnz .LBB0_4 -; SI-NEXT: .LBB0_2: ; %end -; SI-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; SI-NEXT: s_branch .LBB0_5 -; SI-NEXT: .LBB0_3: ; %else +; SI-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; SI-NEXT: s_and_b32 s1, vcc_lo, -1 +; SI-NEXT: s_cmov_b32 exec_lo, vcc_lo +; SI-NEXT: s_cbranch_scc0 .LBB0_2 +; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: v_mul_f32_e32 v0, 0x40400000, v1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: s_andn2_saveexec_b32 s0, s0 -; SI-NEXT: s_cbranch_execz .LBB0_2 -; SI-NEXT: .LBB0_4: ; %if -; SI-NEXT: v_add_f32_e32 v0, v1, v1 ; SI-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; SI-NEXT: s_branch .LBB0_5 -; SI-NEXT: .LBB0_5: +; SI-NEXT: .LBB0_2: ; %Flow +; SI-NEXT: s_xor_b32 s1, s0, exec_lo +; SI-NEXT: s_and_b32 s2, s0, -1 +; SI-NEXT: s_cmov_b32 exec_lo, s0 +; SI-NEXT: s_cbranch_scc0 .LBB0_4 +; SI-NEXT: ; %bb.3: ; %if +; SI-NEXT: v_add_f32_e32 v0, v1, v1 +; SI-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; SI-NEXT: .LBB0_4: ; %end +; SI-NEXT: ; return to shader part epilog main_body: %cc = icmp sgt i32 %z, 5 br i1 %cc, label %if, label %else @@ -50,17 +49,23 @@ define amdgpu_ps float @else2(i32 %z, float %v) #0 { ; SI: ; %bb.0: ; %main_body ; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v0 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: s_and_saveexec_b32 s0, vcc_lo -; SI-NEXT: s_xor_b32 s0, exec_lo, s0 +; SI-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; SI-NEXT: s_and_b32 s1, vcc_lo, -1 +; SI-NEXT: s_cmov_b32 exec_lo, vcc_lo +; SI-NEXT: s_cbranch_scc0 .LBB1_2 ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: v_mul_f32_e32 v0, 0x40400000, v1 -; SI-NEXT: ; %bb.2: ; %Flow -; SI-NEXT: s_andn2_saveexec_b32 s0, s0 +; SI-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; SI-NEXT: .LBB1_2: ; %Flow +; SI-NEXT: s_xor_b32 s1, s0, exec_lo +; SI-NEXT: s_and_b32 s2, s0, -1 +; SI-NEXT: s_cmov_b32 exec_lo, s0 +; SI-NEXT: s_cbranch_scc0 .LBB1_4 ; SI-NEXT: ; %bb.3: ; %if ; SI-NEXT: v_add_f32_e32 v1, v1, v1 ; SI-NEXT: v_mov_b32_e32 v0, v1 -; SI-NEXT: ; %bb.4: ; %end -; SI-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; SI-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; SI-NEXT: .LBB1_4: ; %end ; SI-NEXT: v_add_f32_e32 v0, v1, v0 ; SI-NEXT: ; return to shader part epilog main_body: @@ -91,30 +96,36 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 { ; SI-NEXT: s_branch .LBB2_2 ; SI-NEXT: .LBB2_1: ; %if.end ; SI-NEXT: ; in Loop: Header=BB2_2 Depth=1 -; SI-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; SI-NEXT: v_add_nc_u32_e32 v2, 1, v3 ; SI-NEXT: s_add_i32 s1, s1, 1 ; SI-NEXT: s_cmp_lt_i32 s1, s0 ; SI-NEXT: s_cbranch_scc0 .LBB2_6 ; SI-NEXT: .LBB2_2: ; %for.body ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_and_b32 s3, vcc_lo, exec_lo ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: s_and_saveexec_b32 s2, vcc_lo -; SI-NEXT: s_xor_b32 s2, exec_lo, s2 +; SI-NEXT: s_xor_b32 s2, s3, exec_lo +; SI-NEXT: s_and_b32 s4, s3, -1 +; SI-NEXT: s_cmov_b32 exec_lo, s3 +; SI-NEXT: s_cbranch_scc0 .LBB2_4 ; SI-NEXT: ; %bb.3: ; %else ; SI-NEXT: ; in Loop: Header=BB2_2 Depth=1 ; SI-NEXT: v_mul_f32_e32 v0, v1, v2 ; SI-NEXT: v_lshl_add_u32 v3, v2, 1, v2 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; %bb.4: ; %Flow +; SI-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; SI-NEXT: .LBB2_4: ; %Flow ; SI-NEXT: ; in Loop: Header=BB2_2 Depth=1 -; SI-NEXT: s_andn2_saveexec_b32 s2, s2 -; SI-NEXT: s_cbranch_execz .LBB2_1 +; SI-NEXT: s_xor_b32 s3, s2, exec_lo +; SI-NEXT: s_and_b32 s4, s2, -1 +; SI-NEXT: s_cmov_b32 exec_lo, s2 +; SI-NEXT: s_cbranch_scc0 .LBB2_1 ; SI-NEXT: ; %bb.5: ; %if ; SI-NEXT: ; in Loop: Header=BB2_2 Depth=1 ; SI-NEXT: v_mul_f32_e32 v0, s1, v1 ; SI-NEXT: v_add_nc_u32_e32 v3, 1, v2 +; SI-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; SI-NEXT: s_branch .LBB2_1 ; SI-NEXT: .LBB2_6: ; %for.end ; SI-NEXT: v_add_f32_e32 v0, v3, v0 @@ -162,60 +173,62 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun ; SI-LABEL: loop: ; SI: ; %bb.0: ; %main_body ; SI-NEXT: v_mov_b32_e32 v6, v0 -; SI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; SI-NEXT: s_mov_b32 s14, -1 -; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; SI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s11, 0x31c16000 ; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v6 -; SI-NEXT: s_mov_b32 s15, 0x31c16000 -; SI-NEXT: s_add_u32 s12, s12, s1 -; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: s_add_u32 s8, s8, s1 +; SI-NEXT: s_addc_u32 s9, s9, 0 ; SI-NEXT: s_mov_b32 s32, 0 +; SI-NEXT: s_xor_b32 s6, vcc_lo, exec_lo +; SI-NEXT: s_and_b32 s0, vcc_lo, -1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: s_and_saveexec_b32 s0, vcc_lo -; SI-NEXT: s_xor_b32 s6, exec_lo, s0 -; SI-NEXT: s_cbranch_execz .LBB3_4 -; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_mov_b32 s7, exec_lo -; SI-NEXT: .LBB3_2: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_cmov_b32 exec_lo, vcc_lo +; SI-NEXT: s_cbranch_scc0 .LBB3_3 +; SI-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_readfirstlane_b32 s4, v4 ; SI-NEXT: v_readfirstlane_b32 s5, v5 ; SI-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] -; SI-NEXT: s_and_saveexec_b32 s8, vcc_lo -; SI-NEXT: s_mov_b64 s[0:1], s[12:13] -; SI-NEXT: s_mov_b64 s[2:3], s[14:15] +; SI-NEXT: s_and_saveexec_b32 s7, vcc_lo +; SI-NEXT: s_mov_b64 s[0:1], s[8:9] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; SI-NEXT: v_mov_b32_e32 v1, v0 +; SI-NEXT: s_xor_b32 s0, exec_lo, s7 ; SI-NEXT: ; implicit-def: $vgpr4_vgpr5 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s8 -; SI-NEXT: s_cbranch_execnz .LBB3_2 -; SI-NEXT: ; %bb.3: -; SI-NEXT: s_mov_b32 exec_lo, s7 +; SI-NEXT: s_and_b32 s1, s0, -1 +; SI-NEXT: s_cselect_b32 exec_lo, s0, s7 +; SI-NEXT: s_cbranch_scc1 .LBB3_1 +; SI-NEXT: ; %bb.2: ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: .LBB3_4: ; %Flow -; SI-NEXT: s_andn2_saveexec_b32 s6, s6 -; SI-NEXT: s_cbranch_execz .LBB3_8 -; SI-NEXT: ; %bb.5: ; %if -; SI-NEXT: s_mov_b32 s7, exec_lo -; SI-NEXT: .LBB3_6: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; SI-NEXT: .LBB3_3: ; %Flow +; SI-NEXT: s_xor_b32 s7, s6, exec_lo +; SI-NEXT: s_and_b32 s0, s6, -1 +; SI-NEXT: s_cmov_b32 exec_lo, s6 +; SI-NEXT: s_cbranch_scc0 .LBB3_6 +; SI-NEXT: .LBB3_4: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_readfirstlane_b32 s4, v2 ; SI-NEXT: v_readfirstlane_b32 s5, v3 ; SI-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[2:3] -; SI-NEXT: s_and_saveexec_b32 s8, vcc_lo -; SI-NEXT: s_mov_b64 s[0:1], s[12:13] -; SI-NEXT: s_mov_b64 s[2:3], s[14:15] +; SI-NEXT: s_and_saveexec_b32 s6, vcc_lo +; SI-NEXT: s_mov_b64 s[0:1], s[8:9] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; SI-NEXT: v_mov_b32_e32 v1, v0 +; SI-NEXT: s_xor_b32 s0, exec_lo, s6 ; SI-NEXT: ; implicit-def: $vgpr2_vgpr3 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s8 -; SI-NEXT: s_cbranch_execnz .LBB3_6 -; SI-NEXT: ; %bb.7: -; SI-NEXT: s_mov_b32 exec_lo, s7 -; SI-NEXT: .LBB3_8: ; %end -; SI-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; SI-NEXT: s_and_b32 s1, s0, -1 +; SI-NEXT: s_cselect_b32 exec_lo, s0, s6 +; SI-NEXT: s_cbranch_scc1 .LBB3_4 +; SI-NEXT: ; %bb.5: +; SI-NEXT: s_or_b32 exec_lo, exec_lo, s7 +; SI-NEXT: .LBB3_6: ; %end ; SI-NEXT: v_mov_b32_e32 v0, v1 ; SI-NEXT: ; return to shader part epilog main_body: @@ -239,57 +252,59 @@ end: define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %extern_func, ptr %extern_func2) #0 { ; SI-LABEL: loop_with_use: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; SI-NEXT: s_mov_b32 s14, -1 -; SI-NEXT: v_mov_b32_e32 v40, v1 +; SI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; SI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v0 -; SI-NEXT: s_mov_b32 s15, 0x31c16000 -; SI-NEXT: s_add_u32 s12, s12, s1 -; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s11, 0x31c16000 +; SI-NEXT: v_mov_b32_e32 v40, v1 +; SI-NEXT: s_add_u32 s8, s8, s1 +; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: s_xor_b32 s6, vcc_lo, exec_lo +; SI-NEXT: s_and_b32 s0, vcc_lo, -1 ; SI-NEXT: s_mov_b32 s32, 0 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: s_and_saveexec_b32 s0, vcc_lo -; SI-NEXT: s_xor_b32 s6, exec_lo, s0 -; SI-NEXT: s_cbranch_execz .LBB4_4 -; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_mov_b32 s7, exec_lo -; SI-NEXT: .LBB4_2: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_cmov_b32 exec_lo, vcc_lo +; SI-NEXT: s_cbranch_scc0 .LBB4_3 +; SI-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_readfirstlane_b32 s4, v4 ; SI-NEXT: v_readfirstlane_b32 s5, v5 ; SI-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] -; SI-NEXT: s_and_saveexec_b32 s8, vcc_lo +; SI-NEXT: s_and_saveexec_b32 s7, vcc_lo ; SI-NEXT: v_mov_b32_e32 v0, v40 -; SI-NEXT: s_mov_b64 s[0:1], s[12:13] -; SI-NEXT: s_mov_b64 s[2:3], s[14:15] +; SI-NEXT: s_mov_b64 s[0:1], s[8:9] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SI-NEXT: s_xor_b32 s0, exec_lo, s7 ; SI-NEXT: ; implicit-def: $vgpr4_vgpr5 -; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s8 -; SI-NEXT: s_cbranch_execnz .LBB4_2 -; SI-NEXT: ; %bb.3: -; SI-NEXT: s_mov_b32 exec_lo, s7 +; SI-NEXT: s_and_b32 s1, s0, -1 +; SI-NEXT: s_cselect_b32 exec_lo, s0, s7 +; SI-NEXT: s_cbranch_scc1 .LBB4_1 +; SI-NEXT: ; %bb.2: ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: .LBB4_4: ; %Flow -; SI-NEXT: s_andn2_saveexec_b32 s6, s6 -; SI-NEXT: s_cbranch_execz .LBB4_8 -; SI-NEXT: ; %bb.5: ; %if -; SI-NEXT: s_mov_b32 s7, exec_lo -; SI-NEXT: .LBB4_6: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; SI-NEXT: .LBB4_3: ; %Flow +; SI-NEXT: s_xor_b32 s7, s6, exec_lo +; SI-NEXT: s_and_b32 s0, s6, -1 +; SI-NEXT: s_cmov_b32 exec_lo, s6 +; SI-NEXT: s_cbranch_scc0 .LBB4_6 +; SI-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_readfirstlane_b32 s4, v2 ; SI-NEXT: v_readfirstlane_b32 s5, v3 ; SI-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[2:3] -; SI-NEXT: s_and_saveexec_b32 s8, vcc_lo +; SI-NEXT: s_and_saveexec_b32 s6, vcc_lo ; SI-NEXT: v_mov_b32_e32 v0, v40 -; SI-NEXT: s_mov_b64 s[0:1], s[12:13] -; SI-NEXT: s_mov_b64 s[2:3], s[14:15] +; SI-NEXT: s_mov_b64 s[0:1], s[8:9] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SI-NEXT: s_xor_b32 s0, exec_lo, s6 ; SI-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s8 -; SI-NEXT: s_cbranch_execnz .LBB4_6 -; SI-NEXT: ; %bb.7: -; SI-NEXT: s_mov_b32 exec_lo, s7 -; SI-NEXT: .LBB4_8: ; %end -; SI-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; SI-NEXT: s_and_b32 s1, s0, -1 +; SI-NEXT: s_cselect_b32 exec_lo, s0, s6 +; SI-NEXT: s_cbranch_scc1 .LBB4_4 +; SI-NEXT: ; %bb.5: +; SI-NEXT: s_or_b32 exec_lo, exec_lo, s7 +; SI-NEXT: .LBB4_6: ; %end ; SI-NEXT: v_add_f32_e32 v0, v0, v40 ; SI-NEXT: ; return to shader part epilog main_body: diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll b/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll index 4efa1e9353ab3..36e6727eddba8 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll @@ -75,15 +75,17 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp ; CHECK: ; %bb.0: ; %.entry ; CHECK-NEXT: global_load_b32 v3, v[0:1], off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 -; CHECK-NEXT: s_mov_b32 s0, exec_lo ; CHECK-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; CHECK-NEXT: v_lshlrev_b64_e32 v[3:4], 2, v[3:4] ; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; CHECK-NEXT: global_load_b32 v3, v[0:1], off offset:336 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; CHECK-NEXT: s_and_b32 s1, vcc_lo, -1 ; CHECK-NEXT: scratch_store_b32 off, v3, off offset:8 ; 4-byte Folded Spill ; CHECK-NEXT: global_load_b32 v3, v[0:1], off offset:448 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 @@ -94,9 +96,8 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp ; CHECK-NEXT: global_load_b32 v3, v[0:1], off offset:720 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 ; CHECK-NEXT: scratch_store_b32 off, v3, off offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: v_cmpx_eq_u32_e32 0, v2 -; CHECK-NEXT: s_xor_b32 s0, exec_lo, s0 -; CHECK-NEXT: s_cbranch_execz .LBB1_2 +; CHECK-NEXT: s_cmov_b32 exec_lo, vcc_lo +; CHECK-NEXT: s_cbranch_scc0 .LBB1_2 ; CHECK-NEXT: ; %bb.1: ; %.false ; CHECK-NEXT: global_load_b32 v10, v[0:1], off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 @@ -153,9 +154,13 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp ; CHECK-NEXT: ; implicit-def: $vgpr0 ; CHECK-NEXT: ; kill: killed $vgpr0 ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; CHECK-NEXT: .LBB1_2: ; %Flow -; CHECK-NEXT: s_and_not1_saveexec_b32 s0, s0 -; CHECK-NEXT: s_cbranch_execz .LBB1_4 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT: s_xor_b32 s1, s0, exec_lo +; CHECK-NEXT: s_and_b32 s2, s0, -1 +; CHECK-NEXT: s_cmov_b32 exec_lo, s0 +; CHECK-NEXT: s_cbranch_scc0 .LBB1_4 ; CHECK-NEXT: ; %bb.3: ; %.true ; CHECK-NEXT: global_load_b32 v10, v[0:1], off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 @@ -207,8 +212,8 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp ; CHECK-NEXT: s_wait_loadcnt 0x0 ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; CHECK-NEXT: .LBB1_4: ; %.exit -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; CHECK-NEXT: scratch_load_b32 v0, off, off th:TH_LOAD_LU ; 4-byte Folded Reload ; CHECK-NEXT: s_wait_loadcnt 0x0 ; CHECK-NEXT: s_wait_storecnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll index 20dc5ad5c8665..eaa5be96c208c 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll @@ -24,28 +24,29 @@ define amdgpu_kernel void @__omp_offloading_16_dd2df_main_l9() { ; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: ; implicit-def: $sgpr4 ; CHECK-NEXT: s_mov_b32 s4, 0 -; CHECK-NEXT: v_cmp_eq_u32_e64 s[6:7], v2, s4 -; CHECK-NEXT: s_mov_b32 s4, 0 -; CHECK-NEXT: v_mov_b32_e32 v2, s4 +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v2, s4 +; CHECK-NEXT: s_mov_b32 s6, 0 +; CHECK-NEXT: v_mov_b32_e32 v2, s6 ; CHECK-NEXT: ds_write_b8 v1, v2 -; CHECK-NEXT: s_mov_b64 s[4:5], exec -; CHECK-NEXT: v_writelane_b32 v0, s4, 0 -; CHECK-NEXT: v_writelane_b32 v0, s5, 1 +; CHECK-NEXT: s_mov_b64 s[6:7], exec +; CHECK-NEXT: v_writelane_b32 v0, s6, 0 +; CHECK-NEXT: v_writelane_b32 v0, s7, 1 ; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1 ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[8:9] -; CHECK-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; CHECK-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-NEXT: s_cbranch_execz .LBB0_2 -; CHECK-NEXT: ; %bb.1: ; %bb193 -; CHECK-NEXT: .LBB0_2: ; %bb194 +; CHECK-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; CHECK-NEXT: s_cmov_b64 exec, s[4:5] +; CHECK-NEXT: s_cbranch_scc1 .LBB0_1 +; CHECK-NEXT: s_branch .LBB0_2 +; CHECK-NEXT: .LBB0_1: ; %bb193 ; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1 -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[8:9] ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_readlane_b32 s4, v1, 0 -; CHECK-NEXT: v_readlane_b32 s5, v1, 1 +; CHECK-NEXT: v_readlane_b32 s4, v0, 0 +; CHECK-NEXT: v_readlane_b32 s5, v0, 1 ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: .LBB0_2: ; %bb194 ; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll index f78b408d78255..dcb74e2f26eff 100644 --- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll @@ -6,26 +6,28 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX906-NEXT: v_lshlrev_b32_e32 v5, 2, v0 -; GFX906-NEXT: v_mov_b32_e32 v1, 0 +; GFX906-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: s_mov_b64 s[8:9], exec ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dword v2, v5, s[4:5] +; GFX906-NEXT: global_load_dword v2, v3, s[4:5] +; GFX906-NEXT: v_mov_b32_e32 v1, 0 +; GFX906-NEXT: s_and_b64 s[0:1], vcc, -1 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX906-NEXT: v_lshrrev_b32_e32 v4, 8, v2 -; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX906-NEXT: s_cbranch_execz .LBB0_2 +; GFX906-NEXT: s_cmov_b64 exec, vcc +; GFX906-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dword v2, v5, s[6:7] +; GFX906-NEXT: global_load_dword v2, v3, s[6:7] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX906-NEXT: v_lshrrev_b32_e32 v4, 8, v2 +; GFX906-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX906-NEXT: .LBB0_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX906-NEXT: global_store_byte v1, v0, s[2:3] offset:2 ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v4 ; GFX906-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: global_store_byte v1, v3, s[2:3] offset:2 ; GFX906-NEXT: global_store_short v1, v0, s[2:3] ; GFX906-NEXT: s_endpgm entry: @@ -50,30 +52,32 @@ define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX906-NEXT: v_lshlrev_b32_e32 v6, 2, v0 -; GFX906-NEXT: v_mov_b32_e32 v1, 0 +; GFX906-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: s_mov_b64 s[8:9], exec ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dword v2, v6, s[4:5] +; GFX906-NEXT: global_load_dword v2, v3, s[4:5] +; GFX906-NEXT: v_mov_b32_e32 v1, 0 +; GFX906-NEXT: s_and_b64 s[0:1], vcc, -1 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v2 ; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX906-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX906-NEXT: s_cbranch_execz .LBB1_2 +; GFX906-NEXT: s_cmov_b64 exec, vcc +; GFX906-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dword v2, v6, s[6:7] +; GFX906-NEXT: global_load_dword v2, v3, s[6:7] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v2 ; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX906-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX906-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX906-NEXT: .LBB1_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v5 -; GFX906-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v3 -; GFX906-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v5 +; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX906-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dword v1, v0, s[2:3] ; GFX906-NEXT: s_endpgm entry: @@ -98,31 +102,33 @@ define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX906-NEXT: v_lshlrev_b32_e32 v7, 3, v0 -; GFX906-NEXT: v_mov_b32_e32 v5, 0 +; GFX906-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: s_mov_b64 s[8:9], exec ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx2 v[1:2], v7, s[4:5] +; GFX906-NEXT: global_load_dwordx2 v[1:2], v6, s[4:5] +; GFX906-NEXT: v_mov_b32_e32 v5, 0 +; GFX906-NEXT: s_and_b64 s[0:1], vcc, -1 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2] -; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX906-NEXT: v_lshrrev_b32_e32 v6, 8, v1 -; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX906-NEXT: s_cbranch_execz .LBB2_2 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX906-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX906-NEXT: s_cmov_b64 exec, vcc +; GFX906-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dwordx2 v[1:2], v7, s[6:7] +; GFX906-NEXT: global_load_dwordx2 v[1:2], v6, s[6:7] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2] -; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX906-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX906-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX906-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX906-NEXT: .LBB2_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v6 -; GFX906-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v3 -; GFX906-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_byte v5, v2, s[2:3] offset:4 +; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v4 +; GFX906-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v3 +; GFX906-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dword v5, v0, s[2:3] ; GFX906-NEXT: s_endpgm entry: @@ -147,42 +153,44 @@ define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX906-NEXT: v_lshlrev_b32_e32 v10, 3, v0 -; GFX906-NEXT: v_mov_b32_e32 v3, 0 +; GFX906-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: s_mov_b64 s[8:9], exec ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx2 v[1:2], v10, s[4:5] +; GFX906-NEXT: global_load_dwordx2 v[1:2], v6, s[4:5] +; GFX906-NEXT: v_mov_b32_e32 v3, 0 +; GFX906-NEXT: s_and_b64 s[0:1], vcc, -1 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshrrev_b32_e32 v4, 24, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v6, 8, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX906-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX906-NEXT: v_lshrrev_b32_e32 v8, 16, v1 ; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v1 -; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX906-NEXT: s_cbranch_execz .LBB3_2 +; GFX906-NEXT: s_cmov_b64 exec, vcc +; GFX906-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dwordx2 v[1:2], v10, s[6:7] +; GFX906-NEXT: global_load_dwordx2 v[1:2], v6, s[6:7] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshrrev_b32_e32 v4, 24, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v6, 8, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX906-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX906-NEXT: v_lshrrev_b32_e32 v8, 16, v1 ; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v1 +; GFX906-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX906-NEXT: .LBB3_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v9 -; GFX906-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v7 -; GFX906-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v6 -; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v4 -; GFX906-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx2 v3, v[0:1], s[2:3] +; GFX906-NEXT: v_lshlrev_b16_e32 v6, 8, v9 +; GFX906-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_lshlrev_b16_e32 v6, 8, v7 +; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v5 +; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX906-NEXT: v_or_b32_sdwa v6, v8, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: global_store_dwordx2 v3, v[1:2], s[2:3] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -206,64 +214,66 @@ define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1 ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX906-NEXT: v_lshlrev_b32_e32 v18, 4, v0 -; GFX906-NEXT: v_mov_b32_e32 v5, 0 +; GFX906-NEXT: v_lshlrev_b32_e32 v13, 4, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: s_mov_b64 s[8:9], exec ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx4 v[1:4], v18, s[4:5] +; GFX906-NEXT: global_load_dwordx4 v[1:4], v13, s[4:5] +; GFX906-NEXT: v_mov_b32_e32 v5, 0 +; GFX906-NEXT: s_and_b64 s[0:1], vcc, -1 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshrrev_b32_e32 v6, 24, v4 -; GFX906-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX906-NEXT: v_lshrrev_b32_e32 v8, 8, v4 -; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v3 -; GFX906-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; GFX906-NEXT: v_lshrrev_b32_e32 v11, 8, v3 -; GFX906-NEXT: v_lshrrev_b32_e32 v12, 24, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v4 +; GFX906-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX906-NEXT: v_lshrrev_b32_e32 v7, 8, v4 +; GFX906-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; GFX906-NEXT: v_lshrrev_b32_e32 v10, 8, v3 +; GFX906-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v12, 16, v2 ; GFX906-NEXT: v_lshrrev_b32_e32 v14, 8, v2 ; GFX906-NEXT: v_lshrrev_b32_e32 v15, 24, v1 ; GFX906-NEXT: v_lshrrev_b32_e32 v16, 16, v1 ; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v1 -; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX906-NEXT: s_cbranch_execz .LBB4_2 +; GFX906-NEXT: s_cmov_b64 exec, vcc +; GFX906-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dwordx4 v[1:4], v18, s[6:7] +; GFX906-NEXT: global_load_dwordx4 v[1:4], v13, s[6:7] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshrrev_b32_e32 v6, 24, v4 -; GFX906-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX906-NEXT: v_lshrrev_b32_e32 v8, 8, v4 -; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v3 -; GFX906-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; GFX906-NEXT: v_lshrrev_b32_e32 v11, 8, v3 -; GFX906-NEXT: v_lshrrev_b32_e32 v12, 24, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v4 +; GFX906-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX906-NEXT: v_lshrrev_b32_e32 v7, 8, v4 +; GFX906-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; GFX906-NEXT: v_lshrrev_b32_e32 v10, 8, v3 +; GFX906-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v12, 16, v2 ; GFX906-NEXT: v_lshrrev_b32_e32 v14, 8, v2 ; GFX906-NEXT: v_lshrrev_b32_e32 v15, 24, v1 ; GFX906-NEXT: v_lshrrev_b32_e32 v16, 16, v1 ; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v1 +; GFX906-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX906-NEXT: .LBB4_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v17 -; GFX906-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v15 -; GFX906-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v14 -; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v12 -; GFX906-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v11 -; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v9 -; GFX906-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v8 -; GFX906-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v6 -; GFX906-NEXT: v_or_b32_sdwa v4, v7, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v5, v[0:3], s[2:3] +; GFX906-NEXT: v_lshlrev_b16_e32 v13, 8, v17 +; GFX906-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_lshlrev_b16_e32 v13, 8, v15 +; GFX906-NEXT: v_or_b32_sdwa v13, v16, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: v_lshlrev_b16_e32 v13, 8, v14 +; GFX906-NEXT: v_lshlrev_b16_e32 v11, 8, v11 +; GFX906-NEXT: v_lshlrev_b16_e32 v10, 8, v10 +; GFX906-NEXT: v_lshlrev_b16_e32 v8, 8, v8 +; GFX906-NEXT: v_lshlrev_b16_e32 v7, 8, v7 +; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX906-NEXT: v_or_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v3, v3, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v2, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: global_store_dwordx4 v5, v[1:4], s[2:3] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -286,13 +296,15 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1 ; GFX906-LABEL: v32i8_liveout: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX906-NEXT: v_lshlrev_b32_e32 v31, 5, v0 +; GFX906-NEXT: v_lshlrev_b32_e32 v24, 5, v0 ; GFX906-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX906-NEXT: v_mov_b32_e32 v9, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: s_mov_b64 s[2:3], exec ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx4 v[1:4], v31, s[4:5] offset:16 -; GFX906-NEXT: global_load_dwordx4 v[5:8], v31, s[4:5] +; GFX906-NEXT: global_load_dwordx4 v[1:4], v24, s[4:5] offset:16 +; GFX906-NEXT: global_load_dwordx4 v[5:8], v24, s[4:5] +; GFX906-NEXT: v_mov_b32_e32 v9, 0 +; GFX906-NEXT: s_and_b64 s[4:5], vcc, -1 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v4 ; GFX906-NEXT: v_lshrrev_b32_e32 v10, 16, v4 @@ -310,20 +322,20 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1 ; GFX906-NEXT: v_lshrrev_b32_e32 v21, 24, v8 ; GFX906-NEXT: v_lshrrev_b32_e32 v22, 16, v8 ; GFX906-NEXT: v_lshrrev_b32_e32 v23, 8, v8 -; GFX906-NEXT: v_lshrrev_b32_e32 v24, 24, v7 -; GFX906-NEXT: v_lshrrev_b32_e32 v25, 16, v7 -; GFX906-NEXT: v_lshrrev_b32_e32 v26, 8, v7 -; GFX906-NEXT: v_lshrrev_b32_e32 v27, 24, v6 -; GFX906-NEXT: v_lshrrev_b32_e32 v28, 16, v6 -; GFX906-NEXT: v_lshrrev_b32_e32 v29, 8, v6 -; GFX906-NEXT: v_lshrrev_b32_e32 v30, 24, v5 +; GFX906-NEXT: v_lshrrev_b32_e32 v25, 24, v7 +; GFX906-NEXT: v_lshrrev_b32_e32 v26, 16, v7 +; GFX906-NEXT: v_lshrrev_b32_e32 v27, 8, v7 +; GFX906-NEXT: v_lshrrev_b32_e32 v28, 24, v6 +; GFX906-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; GFX906-NEXT: v_lshrrev_b32_e32 v30, 8, v6 +; GFX906-NEXT: v_lshrrev_b32_e32 v31, 24, v5 ; GFX906-NEXT: v_lshrrev_b32_e32 v32, 16, v5 ; GFX906-NEXT: v_lshrrev_b32_e32 v33, 8, v5 -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX906-NEXT: s_cbranch_execz .LBB5_2 +; GFX906-NEXT: s_cmov_b64 exec, vcc +; GFX906-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dwordx4 v[1:4], v31, s[6:7] offset:16 -; GFX906-NEXT: global_load_dwordx4 v[5:8], v31, s[6:7] +; GFX906-NEXT: global_load_dwordx4 v[1:4], v24, s[6:7] offset:16 +; GFX906-NEXT: global_load_dwordx4 v[5:8], v24, s[6:7] ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v4 ; GFX906-NEXT: v_lshrrev_b32_e32 v10, 16, v4 @@ -341,35 +353,35 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1 ; GFX906-NEXT: v_lshrrev_b32_e32 v21, 24, v8 ; GFX906-NEXT: v_lshrrev_b32_e32 v22, 16, v8 ; GFX906-NEXT: v_lshrrev_b32_e32 v23, 8, v8 -; GFX906-NEXT: v_lshrrev_b32_e32 v24, 24, v7 -; GFX906-NEXT: v_lshrrev_b32_e32 v25, 16, v7 -; GFX906-NEXT: v_lshrrev_b32_e32 v26, 8, v7 -; GFX906-NEXT: v_lshrrev_b32_e32 v27, 24, v6 -; GFX906-NEXT: v_lshrrev_b32_e32 v28, 16, v6 -; GFX906-NEXT: v_lshrrev_b32_e32 v29, 8, v6 -; GFX906-NEXT: v_lshrrev_b32_e32 v30, 24, v5 +; GFX906-NEXT: v_lshrrev_b32_e32 v25, 24, v7 +; GFX906-NEXT: v_lshrrev_b32_e32 v26, 16, v7 +; GFX906-NEXT: v_lshrrev_b32_e32 v27, 8, v7 +; GFX906-NEXT: v_lshrrev_b32_e32 v28, 24, v6 +; GFX906-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; GFX906-NEXT: v_lshrrev_b32_e32 v30, 8, v6 +; GFX906-NEXT: v_lshrrev_b32_e32 v31, 24, v5 ; GFX906-NEXT: v_lshrrev_b32_e32 v32, 16, v5 ; GFX906-NEXT: v_lshrrev_b32_e32 v33, 8, v5 -; GFX906-NEXT: .LBB5_2: ; %bb.2 ; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX906-NEXT: v_lshlrev_b16_e32 v30, 8, v30 +; GFX906-NEXT: .LBB5_2: ; %bb.2 +; GFX906-NEXT: v_lshlrev_b16_e32 v24, 8, v31 ; GFX906-NEXT: v_lshlrev_b16_e32 v31, 8, v33 -; GFX906-NEXT: v_lshlrev_b16_e32 v29, 8, v29 -; GFX906-NEXT: v_lshlrev_b16_e32 v27, 8, v27 -; GFX906-NEXT: v_lshlrev_b16_e32 v26, 8, v26 -; GFX906-NEXT: v_lshlrev_b16_e32 v24, 8, v24 +; GFX906-NEXT: v_or_b32_sdwa v24, v32, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v5, v5, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v5, v5, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: v_lshlrev_b16_e32 v24, 8, v27 +; GFX906-NEXT: v_lshlrev_b16_e32 v30, 8, v30 +; GFX906-NEXT: v_lshlrev_b16_e32 v28, 8, v28 +; GFX906-NEXT: v_or_b32_sdwa v7, v7, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_lshlrev_b16_e32 v24, 8, v25 ; GFX906-NEXT: v_lshlrev_b16_e32 v23, 8, v23 ; GFX906-NEXT: v_lshlrev_b16_e32 v21, 8, v21 -; GFX906-NEXT: v_or_b32_sdwa v30, v32, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v5, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v6, v6, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v7, v7, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v6, v6, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v24, v26, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v8, v8, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v5, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v6, v6, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v6, v6, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v7, v7, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v8, v8, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx4 v9, v[5:8], s[0:1] @@ -415,932 +427,934 @@ bb.2: define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v256i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX906-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX906-NEXT: s_mov_b32 s10, -1 -; GFX906-NEXT: s_mov_b32 s11, 0xe00000 -; GFX906-NEXT: s_add_u32 s8, s8, s3 +; GFX906-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX906-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX906-NEXT: s_mov_b32 s14, -1 +; GFX906-NEXT: s_mov_b32 s15, 0xe00000 +; GFX906-NEXT: s_add_u32 s12, s12, s3 ; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX906-NEXT: v_lshlrev_b32_e32 v63, 3, v0 -; GFX906-NEXT: s_addc_u32 s9, s9, 0 +; GFX906-NEXT: v_lshlrev_b32_e32 v62, 3, v0 +; GFX906-NEXT: s_addc_u32 s13, s13, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx4 v[17:20], v63, s[4:5] offset:240 -; GFX906-NEXT: global_load_dwordx4 v[5:8], v63, s[4:5] offset:224 -; GFX906-NEXT: global_load_dwordx4 v[9:12], v63, s[4:5] offset:208 -; GFX906-NEXT: global_load_dwordx4 v[13:16], v63, s[4:5] offset:192 +; GFX906-NEXT: global_load_dwordx4 v[17:20], v62, s[4:5] offset:240 +; GFX906-NEXT: global_load_dwordx4 v[5:8], v62, s[4:5] offset:224 +; GFX906-NEXT: global_load_dwordx4 v[9:12], v62, s[4:5] offset:208 +; GFX906-NEXT: global_load_dwordx4 v[13:16], v62, s[4:5] offset:192 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: s_mov_b64 s[8:9], exec ; GFX906-NEXT: v_mov_b32_e32 v4, 0 +; GFX906-NEXT: s_and_b64 s[0:1], vcc, -1 ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v20 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v20 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v20 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v19 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v19 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v19 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v18 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v18 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v18 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:48 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v17 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:52 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:56 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v17, off, s[12:15], 0 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v20, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v18, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v19, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v20, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v17 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:60 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v8 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:64 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v8 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:68 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v8 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:72 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v7 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:76 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:80 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v7 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:84 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v6 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:88 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:92 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v6 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:96 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v5 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:100 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:104 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v5 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:108 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v12 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:112 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v12 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:116 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v12 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:120 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v11 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:124 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v11 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:128 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v11 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:132 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v10 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:136 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:140 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v10 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:144 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v9 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:148 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v9 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:152 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v9 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:156 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v16 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:160 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v16 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:164 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v16 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:168 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v15 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:176 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v15 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:180 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v15 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:172 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v14 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:188 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v14 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:192 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v14 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:184 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v13 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:200 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v13 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:204 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v13 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[17:20], v63, s[4:5] offset:176 -; GFX906-NEXT: global_load_dwordx4 v[21:24], v63, s[4:5] offset:160 +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:196 ; 4-byte Folded Spill +; GFX906-NEXT: global_load_dwordx4 v[17:20], v62, s[4:5] offset:176 +; GFX906-NEXT: global_load_dwordx4 v[21:24], v62, s[4:5] offset:160 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v20 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:208 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v20 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:212 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v20 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:224 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v19 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:216 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v19 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:220 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v19 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:236 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v18 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:228 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v18 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:232 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v18 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:248 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v17 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:240 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:244 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v17 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:252 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(12) ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v24 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:256 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:260 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v24 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:272 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v23 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:264 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:268 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v23 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:284 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v22 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:276 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v22 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:280 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v22 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:296 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v21 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:288 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v21 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:292 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v21 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[25:28], v63, s[4:5] offset:144 -; GFX906-NEXT: global_load_dwordx4 v[29:32], v63, s[4:5] offset:128 +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:300 ; 4-byte Folded Spill +; GFX906-NEXT: global_load_dwordx4 v[25:28], v62, s[4:5] offset:144 +; GFX906-NEXT: global_load_dwordx4 v[29:32], v62, s[4:5] offset:128 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v28 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:304 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v28 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:308 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v28 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:320 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v27 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:312 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:316 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v27 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:332 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v26 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:324 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:328 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v26 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:344 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v25 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:336 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:340 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v25 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:348 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(12) ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v32 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:352 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v32 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:356 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v32 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:368 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v31 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:360 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v31 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:364 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v31 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:380 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v30 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:372 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v30 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:376 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v30 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:392 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v29 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:384 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:388 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v29 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[33:36], v63, s[4:5] offset:112 -; GFX906-NEXT: global_load_dwordx4 v[37:40], v63, s[4:5] offset:96 +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:396 ; 4-byte Folded Spill +; GFX906-NEXT: global_load_dwordx4 v[33:36], v62, s[4:5] offset:112 +; GFX906-NEXT: global_load_dwordx4 v[37:40], v62, s[4:5] offset:96 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v36 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:400 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:400 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v36 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:404 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v36 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:416 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v35 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:408 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:408 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v35 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:412 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v35 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:428 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:428 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v34 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:420 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:424 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v34 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:440 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v33 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:432 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v33 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:436 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v33 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:444 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:444 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(12) ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v40 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:448 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v40 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:452 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v40 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:464 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v39 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:456 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v39 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:460 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v39 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:476 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v38 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:468 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v38 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:472 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v38 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:488 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v37 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:480 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:480 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v37 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:484 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v37 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[41:44], v63, s[4:5] offset:80 -; GFX906-NEXT: global_load_dwordx4 v[45:48], v63, s[4:5] offset:64 +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:492 ; 4-byte Folded Spill +; GFX906-NEXT: global_load_dwordx4 v[41:44], v62, s[4:5] offset:80 +; GFX906-NEXT: global_load_dwordx4 v[45:48], v62, s[4:5] offset:64 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v44 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:496 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v44 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:500 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v44 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:512 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v43 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:504 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v43 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:508 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v43 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:524 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:524 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v42 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:516 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:516 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v42 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:520 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v42 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:536 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v41 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:528 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v41 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:532 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v41 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:540 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(12) ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v48 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:544 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v48 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:548 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v48 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:560 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v47 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:552 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v47 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:556 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v47 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:572 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v46 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:564 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v46 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:568 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:568 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v46 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:584 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v45 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:576 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v45 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:580 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:580 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v45 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[49:52], v63, s[4:5] offset:48 -; GFX906-NEXT: global_load_dwordx4 v[53:56], v63, s[4:5] offset:32 +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:588 ; 4-byte Folded Spill +; GFX906-NEXT: global_load_dwordx4 v[49:52], v62, s[4:5] offset:48 +; GFX906-NEXT: global_load_dwordx4 v[53:56], v62, s[4:5] offset:32 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v52 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:592 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v52 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:596 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v52 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:608 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v51 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:600 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:600 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v51 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:604 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v51 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:620 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v50 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:612 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v50 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:616 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v50 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:632 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v49 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:624 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v49 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:628 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v49 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:636 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(12) ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v56 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:640 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v56 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:644 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v56 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:656 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v55 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:648 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v55 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:652 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v55 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:668 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v54 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:660 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v54 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:664 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v54 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:680 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v53 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:672 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v53 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:676 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v53 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[57:60], v63, s[4:5] offset:16 +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:684 ; 4-byte Folded Spill +; GFX906-NEXT: global_load_dwordx4 v[57:60], v62, s[4:5] offset:16 ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: global_load_dwordx4 v[0:3], v63, s[4:5] +; GFX906-NEXT: global_load_dwordx4 v[0:3], v62, s[4:5] ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v60 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:688 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:688 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v60 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:692 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v60 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:704 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v59 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:696 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v59 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:700 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v59 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:716 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v58 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:708 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v58 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:712 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v58 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:728 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v57 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:720 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v57 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:724 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v57 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:732 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(12) ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v3 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:736 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v3 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:740 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v3 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:752 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v2 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:744 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v2 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:748 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v2 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:764 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v1 -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 24, v0 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:756 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v1 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 16, v0 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v1 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v0 -; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX906-NEXT: s_cbranch_execz .LBB6_2 +; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:760 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v0 +; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:768 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v0 +; GFX906-NEXT: v_lshrrev_b32_e32 v63, 8, v1 +; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:772 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v0 +; GFX906-NEXT: s_cmov_b64 exec, vcc +; GFX906-NEXT: s_cbranch_scc0 .LBB6_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dwordx4 v[0:3], v63, s[6:7] offset:240 -; GFX906-NEXT: global_load_dwordx4 v[5:8], v63, s[6:7] offset:224 -; GFX906-NEXT: global_load_dwordx4 v[9:12], v63, s[6:7] offset:208 -; GFX906-NEXT: global_load_dwordx4 v[13:16], v63, s[6:7] offset:192 +; GFX906-NEXT: global_load_dwordx4 v[0:3], v62, s[6:7] offset:240 +; GFX906-NEXT: global_load_dwordx4 v[5:8], v62, s[6:7] offset:224 +; GFX906-NEXT: global_load_dwordx4 v[9:12], v62, s[6:7] offset:208 +; GFX906-NEXT: global_load_dwordx4 v[13:16], v62, s[6:7] offset:192 ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshrrev_b32_e32 v17, 24, v3 -; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v17, 16, v3 -; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v3 -; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v17, 24, v2 -; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v17, 16, v2 -; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v2 -; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v17, 24, v1 -; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v17, 16, v1 -; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v1 -; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:48 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v17, 24, v0 -; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:52 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v17, 16, v0 -; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:56 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v0 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:60 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v8 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:64 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v8 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:68 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v8 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:72 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v7 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:76 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:80 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v7 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:84 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v6 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:88 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:92 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v6 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:96 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v5 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:100 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:104 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v5 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:108 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v12 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:112 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v12 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:116 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v12 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:120 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v11 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:124 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v11 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:128 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v11 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:132 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v10 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:136 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:140 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v10 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:144 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v9 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:148 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v9 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:152 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v9 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:156 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v16 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:160 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v16 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:164 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v16 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:168 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v15 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:176 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v15 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:180 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v15 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:172 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v14 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:188 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v14 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:192 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v14 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:184 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v13 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:200 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v13 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:204 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v13 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[17:20], v63, s[6:7] offset:176 -; GFX906-NEXT: global_load_dwordx4 v[21:24], v63, s[6:7] offset:160 +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:196 ; 4-byte Folded Spill +; GFX906-NEXT: global_load_dwordx4 v[17:20], v62, s[6:7] offset:176 +; GFX906-NEXT: global_load_dwordx4 v[21:24], v62, s[6:7] offset:160 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v20 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:208 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v20 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:212 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v20 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:224 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v19 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:216 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v19 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:220 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v19 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:236 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v18 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:228 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v18 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:232 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v18 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:248 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v17 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:240 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:244 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v17 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:252 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(12) ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v24 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:256 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:260 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v24 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:272 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v23 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:264 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:268 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v23 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:284 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v22 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:276 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v22 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:280 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v22 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:296 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v21 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:288 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v21 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:292 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v21 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[25:28], v63, s[6:7] offset:144 -; GFX906-NEXT: global_load_dwordx4 v[29:32], v63, s[6:7] offset:128 +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:300 ; 4-byte Folded Spill +; GFX906-NEXT: global_load_dwordx4 v[25:28], v62, s[6:7] offset:144 +; GFX906-NEXT: global_load_dwordx4 v[29:32], v62, s[6:7] offset:128 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v28 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:304 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v28 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:308 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v28 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:320 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v27 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:312 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:316 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v27 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:332 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v26 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:324 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:328 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v26 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:344 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v25 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:336 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:340 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v25 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:348 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(12) ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v32 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:352 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v32 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:356 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v32 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:368 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v31 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:360 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v31 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:364 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v31 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:380 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v30 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:372 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v30 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:376 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v30 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:392 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v29 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:384 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:388 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v29 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[33:36], v63, s[6:7] offset:112 -; GFX906-NEXT: global_load_dwordx4 v[37:40], v63, s[6:7] offset:96 +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:396 ; 4-byte Folded Spill +; GFX906-NEXT: global_load_dwordx4 v[33:36], v62, s[6:7] offset:112 +; GFX906-NEXT: global_load_dwordx4 v[37:40], v62, s[6:7] offset:96 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v36 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:400 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:400 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v36 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:404 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v36 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:416 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v35 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:408 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:408 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v35 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:412 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v35 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:428 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:428 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v34 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:420 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:424 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v34 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:440 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v33 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:432 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v33 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:436 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v33 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:444 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:444 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(12) ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v40 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:448 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v40 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:452 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v40 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:464 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v39 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:456 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v39 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:460 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v39 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:476 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v38 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:468 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v38 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:472 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v38 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:488 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v37 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:480 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:480 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v37 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:484 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v37 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[41:44], v63, s[6:7] offset:80 -; GFX906-NEXT: global_load_dwordx4 v[45:48], v63, s[6:7] offset:64 +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:492 ; 4-byte Folded Spill +; GFX906-NEXT: global_load_dwordx4 v[41:44], v62, s[6:7] offset:80 +; GFX906-NEXT: global_load_dwordx4 v[45:48], v62, s[6:7] offset:64 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v44 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:496 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v44 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:500 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v44 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:512 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v43 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:504 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v43 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:508 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v43 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:524 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:524 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v42 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:516 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:516 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v42 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:520 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v42 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:536 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v41 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:528 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v41 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:532 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v41 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:540 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(12) ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v48 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:544 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v48 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:548 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v48 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:560 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v47 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:552 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v47 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:556 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v47 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:572 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v46 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:564 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v46 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:568 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:568 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v46 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:584 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v45 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:576 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v45 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:580 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:580 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v45 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[49:52], v63, s[6:7] offset:48 -; GFX906-NEXT: global_load_dwordx4 v[53:56], v63, s[6:7] offset:32 +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:588 ; 4-byte Folded Spill +; GFX906-NEXT: global_load_dwordx4 v[49:52], v62, s[6:7] offset:48 +; GFX906-NEXT: global_load_dwordx4 v[53:56], v62, s[6:7] offset:32 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v52 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:592 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v52 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:596 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v52 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:608 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v51 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:600 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:600 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v51 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:604 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v51 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:620 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v50 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:612 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v50 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:616 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v50 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:632 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v49 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:624 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v49 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:628 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v49 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:636 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(12) ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v56 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:640 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v56 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:644 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v56 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:656 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v55 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:648 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v55 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:652 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v55 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:668 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v54 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:660 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v54 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:664 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v54 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:680 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v53 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:672 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v53 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:676 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v53 -; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[57:60], v63, s[6:7] offset:16 +; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:684 ; 4-byte Folded Spill +; GFX906-NEXT: global_load_dwordx4 v[57:60], v62, s[6:7] offset:16 ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: global_load_dwordx4 v[0:3], v63, s[6:7] +; GFX906-NEXT: global_load_dwordx4 v[0:3], v62, s[6:7] ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v60 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:688 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:688 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v60 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:692 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v60 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:704 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v59 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:696 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v59 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:700 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v59 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:716 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v58 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:708 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v58 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:712 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v58 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:728 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v57 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:720 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v57 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:724 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v57 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:732 ; 4-byte Folded Spill ; GFX906-NEXT: s_waitcnt vmcnt(12) ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v3 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:736 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v3 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:740 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v3 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:752 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v2 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:744 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v2 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:748 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v2 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:764 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v1 -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 24, v0 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:756 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v1 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 16, v0 -; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v1 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v0 +; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:760 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v0 +; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:768 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v0 +; GFX906-NEXT: v_lshrrev_b32_e32 v63, 8, v1 +; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:772 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v0 +; GFX906-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX906-NEXT: .LBB6_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61 +; GFX906-NEXT: v_or_b32_sdwa v0, v0, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v63 ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:764 ; 4-byte Folded Reload -; GFX906-NEXT: v_lshlrev_b16_e32 v62, 8, v62 -; GFX906-NEXT: v_or_b32_sdwa v0, v0, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:772 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v63, off, s[8:11], 0 offset:760 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v61, off, s[12:15], 0 offset:764 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v62, off, s[12:15], 0 offset:772 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v63, off, s[12:15], 0 offset:760 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61 ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:752 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v61, off, s[12:15], 0 offset:752 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61 ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:768 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v61, off, s[12:15], 0 offset:768 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61 ; GFX906-NEXT: v_or_b32_sdwa v61, v62, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:756 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v62, off, s[12:15], 0 offset:756 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:744 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v61, off, s[12:15], 0 offset:744 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v62, 8, v62 ; GFX906-NEXT: v_or_b32_sdwa v62, v63, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:748 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v62, off, s[12:15], 0 offset:748 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_or_b32_sdwa v61, v62, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:736 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:740 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v61, off, s[12:15], 0 offset:736 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v62, off, s[12:15], 0 offset:740 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_or_b32_sdwa v61, v62, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:732 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:732 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:728 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:716 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:704 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:728 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:716 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:704 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: buffer_load_dword v57, off, s[8:11], 0 offset:720 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v57, off, s[12:15], 0 offset:720 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:724 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v58, off, s[12:15], 0 offset:724 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v59, off, s[8:11], 0 offset:712 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v59, off, s[12:15], 0 offset:712 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v57, 8, v57 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_or_b32_sdwa v57, v58, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:708 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v58, off, s[12:15], 0 offset:708 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v57, off, s[8:11], 0 offset:696 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v57, off, s[12:15], 0 offset:696 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v58, 8, v58 ; GFX906-NEXT: v_or_b32_sdwa v58, v59, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:700 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v58, off, s[12:15], 0 offset:700 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v57, 8, v57 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_or_b32_sdwa v57, v58, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v57, off, s[8:11], 0 offset:688 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:692 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v57, off, s[12:15], 0 offset:688 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v58, off, s[12:15], 0 offset:692 ; 4-byte Folded Reload ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: v_or_b32_sdwa v3, v60, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) @@ -1349,42 +1363,42 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v57, v58, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:684 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:680 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:668 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:656 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:680 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:668 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:656 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: buffer_load_dword v53, off, s[8:11], 0 offset:672 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v53, off, s[12:15], 0 offset:672 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:676 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v54, off, s[12:15], 0 offset:676 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v55, off, s[8:11], 0 offset:664 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v55, off, s[12:15], 0 offset:664 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v53, 8, v53 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_or_b32_sdwa v53, v54, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:660 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v54, off, s[12:15], 0 offset:660 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v53, off, s[8:11], 0 offset:648 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v53, off, s[12:15], 0 offset:648 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v54, 8, v54 ; GFX906-NEXT: v_or_b32_sdwa v54, v55, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:652 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v54, off, s[12:15], 0 offset:652 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v53, 8, v53 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_or_b32_sdwa v53, v54, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v53, off, s[8:11], 0 offset:640 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:644 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v53, off, s[12:15], 0 offset:640 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v54, off, s[12:15], 0 offset:644 ; 4-byte Folded Reload ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) @@ -1393,42 +1407,42 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v53, v54, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:32 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:636 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:632 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:620 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:608 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:632 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:620 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:608 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: buffer_load_dword v49, off, s[8:11], 0 offset:624 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v49, off, s[12:15], 0 offset:624 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:628 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v50, off, s[12:15], 0 offset:628 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v51, off, s[8:11], 0 offset:616 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v51, off, s[12:15], 0 offset:616 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v49, 8, v49 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_or_b32_sdwa v49, v50, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:612 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v50, off, s[12:15], 0 offset:612 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v49, off, s[8:11], 0 offset:600 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v49, off, s[12:15], 0 offset:600 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v50, 8, v50 ; GFX906-NEXT: v_or_b32_sdwa v50, v51, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:604 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v50, off, s[12:15], 0 offset:604 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v49, 8, v49 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_or_b32_sdwa v49, v50, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v49, off, s[8:11], 0 offset:592 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:596 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v49, off, s[12:15], 0 offset:592 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v50, off, s[12:15], 0 offset:596 ; 4-byte Folded Reload ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) @@ -1437,42 +1451,42 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v49, v50, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:48 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:588 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:584 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:572 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:560 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:584 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:572 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:560 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: buffer_load_dword v45, off, s[8:11], 0 offset:576 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v45, off, s[12:15], 0 offset:576 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:580 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v46, off, s[12:15], 0 offset:580 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v47, off, s[8:11], 0 offset:568 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v47, off, s[12:15], 0 offset:568 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v45, 8, v45 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_or_b32_sdwa v45, v46, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:564 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v46, off, s[12:15], 0 offset:564 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v45, off, s[8:11], 0 offset:552 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v45, off, s[12:15], 0 offset:552 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v46, 8, v46 ; GFX906-NEXT: v_or_b32_sdwa v46, v47, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:556 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v46, off, s[12:15], 0 offset:556 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v45, 8, v45 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_or_b32_sdwa v45, v46, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v45, off, s[8:11], 0 offset:544 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:548 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v45, off, s[12:15], 0 offset:544 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v46, off, s[12:15], 0 offset:548 ; 4-byte Folded Reload ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: v_or_b32_sdwa v3, v48, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) @@ -1481,42 +1495,42 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v45, v46, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:64 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:540 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:540 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:536 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:524 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:512 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:536 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:524 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:512 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: buffer_load_dword v41, off, s[8:11], 0 offset:528 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v41, off, s[12:15], 0 offset:528 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:532 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v42, off, s[12:15], 0 offset:532 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: v_or_b32_sdwa v2, v43, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v43, off, s[8:11], 0 offset:520 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v43, off, s[12:15], 0 offset:520 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v41, 8, v41 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_or_b32_sdwa v41, v42, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:516 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v42, off, s[12:15], 0 offset:516 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v41, off, s[8:11], 0 offset:504 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v41, off, s[12:15], 0 offset:504 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v42, 8, v42 ; GFX906-NEXT: v_or_b32_sdwa v42, v43, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:508 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v42, off, s[12:15], 0 offset:508 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v41, 8, v41 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_or_b32_sdwa v41, v42, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v41, off, s[8:11], 0 offset:496 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:500 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v41, off, s[12:15], 0 offset:496 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v42, off, s[12:15], 0 offset:500 ; 4-byte Folded Reload ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: v_or_b32_sdwa v3, v44, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) @@ -1525,42 +1539,42 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v41, v42, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:80 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:492 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:492 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:488 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:476 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:464 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:488 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:476 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:464 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: buffer_load_dword v37, off, s[8:11], 0 offset:480 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v37, off, s[12:15], 0 offset:480 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:484 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v38, off, s[12:15], 0 offset:484 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v39, off, s[8:11], 0 offset:472 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v39, off, s[12:15], 0 offset:472 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v37, 8, v37 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_or_b32_sdwa v37, v38, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:468 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v38, off, s[12:15], 0 offset:468 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v37, off, s[8:11], 0 offset:456 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v37, off, s[12:15], 0 offset:456 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v38, 8, v38 ; GFX906-NEXT: v_or_b32_sdwa v38, v39, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:460 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v38, off, s[12:15], 0 offset:460 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v37, 8, v37 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_or_b32_sdwa v37, v38, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v37, off, s[8:11], 0 offset:448 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:452 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v37, off, s[12:15], 0 offset:448 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v38, off, s[12:15], 0 offset:452 ; 4-byte Folded Reload ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: v_or_b32_sdwa v3, v40, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) @@ -1569,42 +1583,42 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v37, v38, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:96 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:444 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:444 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:440 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:428 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:416 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:440 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:428 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:416 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:432 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v33, off, s[12:15], 0 offset:432 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:436 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v34, off, s[12:15], 0 offset:436 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v35, off, s[8:11], 0 offset:424 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v35, off, s[12:15], 0 offset:424 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v33, 8, v33 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:420 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v34, off, s[12:15], 0 offset:420 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:408 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v33, off, s[12:15], 0 offset:408 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v34, 8, v34 ; GFX906-NEXT: v_or_b32_sdwa v34, v35, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:412 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v34, off, s[12:15], 0 offset:412 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v33, 8, v33 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:400 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:404 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v33, off, s[12:15], 0 offset:400 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v34, off, s[12:15], 0 offset:404 ; 4-byte Folded Reload ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) @@ -1613,42 +1627,42 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:112 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:396 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:396 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:392 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:380 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:368 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:392 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:380 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:368 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: buffer_load_dword v29, off, s[8:11], 0 offset:384 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v29, off, s[12:15], 0 offset:384 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:388 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v30, off, s[12:15], 0 offset:388 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: v_or_b32_sdwa v2, v31, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v31, off, s[8:11], 0 offset:376 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v31, off, s[12:15], 0 offset:376 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v29, 8, v29 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:372 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v30, off, s[12:15], 0 offset:372 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v29, off, s[8:11], 0 offset:360 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v29, off, s[12:15], 0 offset:360 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v30, 8, v30 ; GFX906-NEXT: v_or_b32_sdwa v30, v31, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:364 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v30, off, s[12:15], 0 offset:364 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v29, 8, v29 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v29, off, s[8:11], 0 offset:352 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:356 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v29, off, s[12:15], 0 offset:352 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v30, off, s[12:15], 0 offset:356 ; 4-byte Folded Reload ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: v_or_b32_sdwa v3, v32, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) @@ -1657,42 +1671,42 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:128 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:348 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:348 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:344 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:332 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:320 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:344 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:332 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:320 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: buffer_load_dword v25, off, s[8:11], 0 offset:336 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v25, off, s[12:15], 0 offset:336 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:340 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v26, off, s[12:15], 0 offset:340 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: v_or_b32_sdwa v2, v27, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v27, off, s[8:11], 0 offset:328 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v27, off, s[12:15], 0 offset:328 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v25, 8, v25 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:324 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v26, off, s[12:15], 0 offset:324 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v25, off, s[8:11], 0 offset:312 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v25, off, s[12:15], 0 offset:312 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v26, 8, v26 ; GFX906-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:316 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v26, off, s[12:15], 0 offset:316 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v25, 8, v25 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v25, off, s[8:11], 0 offset:304 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:308 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v25, off, s[12:15], 0 offset:304 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v26, off, s[12:15], 0 offset:308 ; 4-byte Folded Reload ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: v_or_b32_sdwa v3, v28, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) @@ -1701,42 +1715,42 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:144 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:300 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:300 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:296 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:284 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:272 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:296 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:284 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:272 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: buffer_load_dword v21, off, s[8:11], 0 offset:288 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v21, off, s[12:15], 0 offset:288 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:292 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v22, off, s[12:15], 0 offset:292 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: v_or_b32_sdwa v2, v23, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v23, off, s[8:11], 0 offset:280 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v23, off, s[12:15], 0 offset:280 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v21, 8, v21 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:276 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v22, off, s[12:15], 0 offset:276 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v21, off, s[8:11], 0 offset:264 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v21, off, s[12:15], 0 offset:264 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v22, 8, v22 ; GFX906-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:268 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v22, off, s[12:15], 0 offset:268 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v21, 8, v21 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v21, off, s[8:11], 0 offset:256 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:260 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v21, off, s[12:15], 0 offset:256 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v22, off, s[12:15], 0 offset:260 ; 4-byte Folded Reload ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: v_or_b32_sdwa v3, v24, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) @@ -1745,42 +1759,42 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:160 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:252 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:252 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:248 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:236 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:224 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:248 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:236 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:224 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX906-NEXT: buffer_load_dword v17, off, s[8:11], 0 offset:240 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v17, off, s[12:15], 0 offset:240 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:244 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v18, off, s[12:15], 0 offset:244 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v19, off, s[8:11], 0 offset:232 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v19, off, s[12:15], 0 offset:232 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v17, 8, v17 ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:228 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v18, off, s[12:15], 0 offset:228 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v17, off, s[8:11], 0 offset:216 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v17, off, s[12:15], 0 offset:216 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v18, 8, v18 ; GFX906-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:220 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v18, off, s[12:15], 0 offset:220 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v17, 8, v17 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v17, off, s[8:11], 0 offset:208 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:212 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v17, off, s[12:15], 0 offset:208 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v18, off, s[12:15], 0 offset:212 ; 4-byte Folded Reload ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: s_waitcnt vmcnt(1) @@ -1789,36 +1803,36 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:176 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:200 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:200 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:204 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:192 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:184 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:204 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:192 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:184 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:188 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:188 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: v_or_b32_sdwa v3, v14, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v14, off, s[8:11], 0 offset:164 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v14, off, s[12:15], 0 offset:164 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:196 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:196 ; 4-byte Folded Reload ; GFX906-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:180 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:180 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:176 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v13, off, s[8:11], 0 offset:160 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:176 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v13, off, s[12:15], 0 offset:160 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:172 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:172 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v13, 8, v13 ; GFX906-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -1826,27 +1840,27 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: v_or_b32_sdwa v3, v15, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:168 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:168 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: v_or_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:192 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:156 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:156 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:152 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:148 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:140 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:152 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:148 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:140 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v9, off, s[8:11], 0 offset:128 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:128 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:144 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:136 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:144 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:136 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX906-NEXT: s_waitcnt vmcnt(0) @@ -1854,9 +1868,9 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:132 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:124 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v10, off, s[8:11], 0 offset:116 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:132 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:124 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:116 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: s_waitcnt vmcnt(1) @@ -1864,8 +1878,8 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:120 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v9, off, s[8:11], 0 offset:112 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:120 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:112 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: s_waitcnt vmcnt(0) @@ -1874,21 +1888,21 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:208 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:108 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:108 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:104 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:100 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:92 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:104 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:100 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:92 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(3) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:80 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:80 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:96 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:88 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:96 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:88 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX906-NEXT: s_waitcnt vmcnt(0) @@ -1896,9 +1910,9 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:84 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:76 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:68 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:84 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:76 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:68 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: s_waitcnt vmcnt(1) @@ -1906,8 +1920,8 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:72 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:64 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:72 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:64 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: s_waitcnt vmcnt(0) @@ -1916,15 +1930,15 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:224 -; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:60 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:60 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v7, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v8, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:52 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:56 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:44 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v7, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:52 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:56 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(7) ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX906-NEXT: s_waitcnt vmcnt(3) @@ -1934,9 +1948,9 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:48 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:40 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:32 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX906-NEXT: s_waitcnt vmcnt(1) @@ -1944,9 +1958,9 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:36 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:28 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:20 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(2) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX906-NEXT: s_waitcnt vmcnt(1) @@ -1954,8 +1968,8 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:24 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll b/llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll index 1afe5cdea8723..c6f5756cbfef8 100644 --- a/llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll +++ b/llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll @@ -18,16 +18,16 @@ declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 i define amdgpu_kernel void @foo(i1 %cmp1) { ; GFX906-LABEL: foo: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX906-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX906-NEXT: s_mov_b32 s10, -1 -; GFX906-NEXT: s_mov_b32 s11, 0xe00000 -; GFX906-NEXT: s_add_u32 s8, s8, s3 -; GFX906-NEXT: s_addc_u32 s9, s9, 0 -; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 -; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:4 -; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:8 -; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:12 +; GFX906-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX906-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX906-NEXT: s_mov_b32 s14, -1 +; GFX906-NEXT: s_mov_b32 s15, 0xe00000 +; GFX906-NEXT: s_add_u32 s12, s12, s3 +; GFX906-NEXT: s_addc_u32 s13, s13, 0 +; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 +; GFX906-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:4 +; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:8 +; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:12 ; GFX906-NEXT: s_load_dword s4, s[0:1], 0x24 ; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x1c ; GFX906-NEXT: s_waitcnt lgkmcnt(0) @@ -39,8 +39,7 @@ define amdgpu_kernel void @foo(i1 %cmp1) { ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_mov_b32 s4, 0 ; GFX906-NEXT: v_mov_b32_e32 v1, v0 -; GFX906-NEXT: s_cselect_b32 s5, 1, 0 -; GFX906-NEXT: s_mov_b64 s[2:3], exec +; GFX906-NEXT: s_cselect_b32 s2, 1, 0 ; GFX906-NEXT: ds_write_b64 v2, v[0:1] ; GFX906-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 ; GFX906-NEXT: s_waitcnt vmcnt(3) @@ -55,12 +54,13 @@ define amdgpu_kernel void @foo(i1 %cmp1) { ; GFX906-NEXT: v_cmp_eq_u64_e64 s[0:1], s[0:1], v[5:6] ; GFX906-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX906-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX906-NEXT: s_xor_b64 s[6:7], exec, s[0:1] +; GFX906-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; GFX906-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6 -; GFX906-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX906-NEXT: s_cbranch_execnz .LBB0_1 +; GFX906-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GFX906-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX906-NEXT: ; %bb.2: -; GFX906-NEXT: s_cmp_lg_u32 s5, 0 -; GFX906-NEXT: s_mov_b64 exec, s[2:3] +; GFX906-NEXT: s_cmp_lg_u32 s2, 0 ; GFX906-NEXT: s_cselect_b32 s5, 0x3ff00000, 0 ; GFX906-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] ; GFX906-NEXT: s_mov_b32 s5, s4 diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index 901e88a4c6aca..537c00c74e319 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -318,8 +318,9 @@ define amdgpu_kernel void @test_mask_if(ptr addrspace(1) %arg) #0 { ; GFX1032-LABEL: test_mask_if: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_cmp_lt_u32_e32 vcc_lo, 10, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB9_2 +; GFX1032-NEXT: s_and_b32 s2, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX1032-NEXT: ; %bb.1: ; %if ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 @@ -331,8 +332,9 @@ define amdgpu_kernel void @test_mask_if(ptr addrspace(1) %arg) #0 { ; GFX1064-LABEL: test_mask_if: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: v_cmp_lt_u32_e32 vcc, 10, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB9_2 +; GFX1064-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX1064-NEXT: ; %bb.1: ; %if ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -362,20 +364,22 @@ define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 { ; GFX1032-NEXT: s_branch .LBB10_2 ; GFX1032-NEXT: .LBB10_1: ; %bb13 ; GFX1032-NEXT: ; in Loop: Header=BB10_2 Depth=1 -; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfe, v4 ; GFX1032-NEXT: v_add_nc_u32_e32 v1, 1, v4 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execz .LBB10_8 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s2 +; GFX1032-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s2 +; GFX1032-NEXT: s_cbranch_scc0 .LBB10_8 ; GFX1032-NEXT: .LBB10_2: ; %bb2 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_cmp_ge_i32_e64 s4, v1, v0 ; GFX1032-NEXT: v_cmp_lt_i32_e32 vcc_lo, v1, v0 +; GFX1032-NEXT: v_cmp_ge_i32_e64 s4, v1, v0 +; GFX1032-NEXT: s_mov_b32 s5, exec_lo ; GFX1032-NEXT: s_mov_b32 s3, 0 -; GFX1032-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB10_4 +; GFX1032-NEXT: s_and_b32 s6, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB10_4 ; GFX1032-NEXT: ; %bb.3: ; %bb5 ; GFX1032-NEXT: ; in Loop: Header=BB10_2 Depth=1 ; GFX1032-NEXT: v_ashrrev_i32_e32 v2, 31, v1 @@ -390,27 +394,35 @@ define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 { ; GFX1032-NEXT: v_cmp_gt_i32_e32 vcc_lo, 11, v4 ; GFX1032-NEXT: s_and_b32 s6, vcc_lo, exec_lo ; GFX1032-NEXT: s_or_b32 s4, s4, s6 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1032-NEXT: .LBB10_4: ; %Flow ; GFX1032-NEXT: ; in Loop: Header=BB10_2 Depth=1 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_mov_b32 s5, exec_lo +; GFX1032-NEXT: s_and_b32 s6, s4, -1 ; GFX1032-NEXT: ; implicit-def: $vgpr4 -; GFX1032-NEXT: s_and_saveexec_b32 s5, s4 -; GFX1032-NEXT: s_xor_b32 s4, exec_lo, s5 +; GFX1032-NEXT: s_cmov_b32 exec_lo, s4 +; GFX1032-NEXT: s_cbranch_scc0 .LBB10_6 ; GFX1032-NEXT: ; %bb.5: ; %bb11 ; GFX1032-NEXT: ; in Loop: Header=BB10_2 Depth=1 ; GFX1032-NEXT: v_lshrrev_b32_e32 v4, 31, v1 ; GFX1032-NEXT: s_andn2_b32 s3, s3, exec_lo ; GFX1032-NEXT: v_add_nc_u32_e32 v4, v1, v4 ; GFX1032-NEXT: v_ashrrev_i32_e32 v4, 1, v4 -; GFX1032-NEXT: ; %bb.6: ; %Flow1 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1032-NEXT: .LBB10_6: ; %Flow1 ; GFX1032-NEXT: ; in Loop: Header=BB10_2 Depth=1 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032-NEXT: s_and_saveexec_b32 s4, s3 -; GFX1032-NEXT: s_cbranch_execz .LBB10_1 +; GFX1032-NEXT: s_and_b32 s3, s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s4, exec_lo +; GFX1032-NEXT: s_and_b32 s5, s3, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, s3 +; GFX1032-NEXT: s_cbranch_scc0 .LBB10_1 ; GFX1032-NEXT: ; %bb.7: ; %bb10 ; GFX1032-NEXT: ; in Loop: Header=BB10_2 Depth=1 ; GFX1032-NEXT: v_mov_b32_e32 v4, v1 ; GFX1032-NEXT: global_store_dword v[2:3], v0, off +; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_branch .LBB10_1 ; GFX1032-NEXT: .LBB10_8: ; %bb1 ; GFX1032-NEXT: s_endpgm @@ -424,20 +436,22 @@ define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 { ; GFX1064-NEXT: s_branch .LBB10_2 ; GFX1064-NEXT: .LBB10_1: ; %bb13 ; GFX1064-NEXT: ; in Loop: Header=BB10_2 Depth=1 -; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX1064-NEXT: v_cmp_lt_i32_e32 vcc, 0xfe, v4 ; GFX1064-NEXT: v_add_nc_u32_e32 v1, 1, v4 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execz .LBB10_8 +; GFX1064-NEXT: s_andn2_b64 s[4:5], exec, s[2:3] +; GFX1064-NEXT: s_and_b64 s[6:7], s[4:5], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[4:5], s[2:3] +; GFX1064-NEXT: s_cbranch_scc0 .LBB10_8 ; GFX1064-NEXT: .LBB10_2: ; %bb2 ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_cmp_ge_i32_e64 s[6:7], v1, v0 ; GFX1064-NEXT: v_cmp_lt_i32_e32 vcc, v1, v0 +; GFX1064-NEXT: v_cmp_ge_i32_e64 s[6:7], v1, v0 +; GFX1064-NEXT: s_mov_b64 s[8:9], exec ; GFX1064-NEXT: s_mov_b64 s[4:5], 0 -; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB10_4 +; GFX1064-NEXT: s_and_b64 s[10:11], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB10_4 ; GFX1064-NEXT: ; %bb.3: ; %bb5 ; GFX1064-NEXT: ; in Loop: Header=BB10_2 Depth=1 ; GFX1064-NEXT: v_ashrrev_i32_e32 v2, 31, v1 @@ -452,27 +466,35 @@ define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 { ; GFX1064-NEXT: v_cmp_gt_i32_e32 vcc, 11, v4 ; GFX1064-NEXT: s_and_b64 s[10:11], vcc, exec ; GFX1064-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] +; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1064-NEXT: .LBB10_4: ; %Flow ; GFX1064-NEXT: ; in Loop: Header=BB10_2 Depth=1 -; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1064-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GFX1064-NEXT: s_mov_b64 s[8:9], exec +; GFX1064-NEXT: s_and_b64 s[10:11], s[6:7], -1 ; GFX1064-NEXT: ; implicit-def: $vgpr4 -; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], s[6:7] -; GFX1064-NEXT: s_xor_b64 s[6:7], exec, s[8:9] +; GFX1064-NEXT: s_cmov_b64 exec, s[6:7] +; GFX1064-NEXT: s_cbranch_scc0 .LBB10_6 ; GFX1064-NEXT: ; %bb.5: ; %bb11 ; GFX1064-NEXT: ; in Loop: Header=BB10_2 Depth=1 ; GFX1064-NEXT: v_lshrrev_b32_e32 v4, 31, v1 ; GFX1064-NEXT: s_andn2_b64 s[4:5], s[4:5], exec ; GFX1064-NEXT: v_add_nc_u32_e32 v4, v1, v4 ; GFX1064-NEXT: v_ashrrev_i32_e32 v4, 1, v4 -; GFX1064-NEXT: ; %bb.6: ; %Flow1 +; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1064-NEXT: .LBB10_6: ; %Flow1 ; GFX1064-NEXT: ; in Loop: Header=BB10_2 Depth=1 -; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GFX1064-NEXT: s_cbranch_execz .LBB10_1 +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_mov_b64 s[6:7], exec +; GFX1064-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; GFX1064-NEXT: s_cmov_b64 exec, s[4:5] +; GFX1064-NEXT: s_cbranch_scc0 .LBB10_1 ; GFX1064-NEXT: ; %bb.7: ; %bb10 ; GFX1064-NEXT: ; in Loop: Header=BB10_2 Depth=1 ; GFX1064-NEXT: v_mov_b32_e32 v4, v1 ; GFX1064-NEXT: global_store_dword v[2:3], v0, off +; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX1064-NEXT: s_branch .LBB10_1 ; GFX1064-NEXT: .LBB10_8: ; %bb1 ; GFX1064-NEXT: s_endpgm @@ -517,8 +539,9 @@ define amdgpu_kernel void @test_loop_with_if_else_break(ptr addrspace(1) %arg) # ; GFX1032: ; %bb.0: ; %bb ; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB11_6 +; GFX1032-NEXT: s_and_b32 s3, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB11_6 ; GFX1032-NEXT: ; %bb.1: ; %.preheader ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_min_u32_e32 v1, 0x100, v0 @@ -540,8 +563,10 @@ define amdgpu_kernel void @test_loop_with_if_else_break(ptr addrspace(1) %arg) # ; GFX1032-NEXT: ; in Loop: Header=BB11_4 Depth=1 ; GFX1032-NEXT: s_and_b32 s5, exec_lo, s4 ; GFX1032-NEXT: s_or_b32 s2, s5, s2 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: s_cbranch_execz .LBB11_6 +; GFX1032-NEXT: s_andn2_b32 s5, exec_lo, s2 +; GFX1032-NEXT: s_and_b32 s6, s5, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s5, s2 +; GFX1032-NEXT: s_cbranch_scc0 .LBB11_6 ; GFX1032-NEXT: .LBB11_4: ; %bb2 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -561,8 +586,9 @@ define amdgpu_kernel void @test_loop_with_if_else_break(ptr addrspace(1) %arg) # ; GFX1064: ; %bb.0: ; %bb ; GFX1064-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_mov_b32 s6, 0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB11_6 +; GFX1064-NEXT: s_and_b64 s[2:3], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB11_6 ; GFX1064-NEXT: ; %bb.1: ; %.preheader ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_min_u32_e32 v1, 0x100, v0 @@ -584,8 +610,10 @@ define amdgpu_kernel void @test_loop_with_if_else_break(ptr addrspace(1) %arg) # ; GFX1064-NEXT: ; in Loop: Header=BB11_4 Depth=1 ; GFX1064-NEXT: s_and_b64 s[8:9], exec, s[4:5] ; GFX1064-NEXT: s_or_b64 s[2:3], s[8:9], s[2:3] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX1064-NEXT: s_cbranch_execz .LBB11_6 +; GFX1064-NEXT: s_andn2_b64 s[8:9], exec, s[2:3] +; GFX1064-NEXT: s_and_b64 s[10:11], s[8:9], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[8:9], s[2:3] +; GFX1064-NEXT: s_cbranch_scc0 .LBB11_6 ; GFX1064-NEXT: .LBB11_4: ; %bb2 ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -1267,20 +1295,22 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, p ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX1032-NEXT: s_mov_b32 s8, exec_lo ; GFX1032-NEXT: s_mov_b32 vcc_lo, 0 +; GFX1032-NEXT: s_and_b32 s1, s0, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dwordx3 v[1:3], v1, s[6:7] -; GFX1032-NEXT: s_and_saveexec_b32 s1, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB22_2 +; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032-NEXT: s_cmov_b32 exec_lo, s0 +; GFX1032-NEXT: s_cbranch_scc0 .LBB22_2 ; GFX1032-NEXT: ; %bb.1: ; %bb ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: global_load_dword v0, v0, s[2:3] glc dlc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_b32 vcc_lo, vcc_lo, exec_lo +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1032-NEXT: .LBB22_2: ; %exit -; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_div_fmas_f32 v1, v1, v2, v3 @@ -1295,20 +1325,22 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, p ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: s_mov_b64 vcc, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dwordx3 v[1:3], v1, s[6:7] -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB22_2 +; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064-NEXT: s_and_b64 s[6:7], s[0:1], -1 +; GFX1064-NEXT: s_cmov_b64 exec, s[0:1] +; GFX1064-NEXT: s_cbranch_scc0 .LBB22_2 ; GFX1064-NEXT: ; %bb.1: ; %bb ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: global_load_dword v0, v0, s[8:9] glc dlc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_b64 vcc, vcc, exec -; GFX1064-NEXT: .LBB22_2: ; %exit -; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: .LBB22_2: ; %exit ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_div_fmas_f32 v1, v1, v2, v3 @@ -1544,8 +1576,10 @@ define amdgpu_kernel void @test_invert_true_phi_cond_break_loop(i32 %arg) #0 { ; GFX1032-NEXT: s_add_i32 s2, s2, 1 ; GFX1032-NEXT: s_and_b32 s3, exec_lo, s3 ; GFX1032-NEXT: s_or_b32 s0, s3, s0 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_cbranch_execz .LBB27_4 +; GFX1032-NEXT: s_andn2_b32 s3, exec_lo, s0 +; GFX1032-NEXT: s_and_b32 s4, s3, -1 +; GFX1032-NEXT: s_cselect_b32 exec_lo, s3, s0 +; GFX1032-NEXT: s_cbranch_scc0 .LBB27_4 ; GFX1032-NEXT: .LBB27_2: ; %bb1 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_or_b32 s1, s1, exec_lo @@ -1561,7 +1595,6 @@ define amdgpu_kernel void @test_invert_true_phi_cond_break_loop(i32 %arg) #0 { ; GFX1032-NEXT: s_or_b32 s1, s1, s3 ; GFX1032-NEXT: s_branch .LBB27_1 ; GFX1032-NEXT: .LBB27_4: ; %bb9 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032-NEXT: v_mov_b32_e32 v0, 7 ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_write_b32 v0, v0 @@ -1582,8 +1615,10 @@ define amdgpu_kernel void @test_invert_true_phi_cond_break_loop(i32 %arg) #0 { ; GFX1064-NEXT: s_add_i32 s4, s4, 1 ; GFX1064-NEXT: s_and_b64 s[6:7], exec, s[6:7] ; GFX1064-NEXT: s_or_b64 s[0:1], s[6:7], s[0:1] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_cbranch_execz .LBB27_4 +; GFX1064-NEXT: s_andn2_b64 s[6:7], exec, s[0:1] +; GFX1064-NEXT: s_and_b64 s[8:9], s[6:7], -1 +; GFX1064-NEXT: s_cselect_b64 exec, s[6:7], s[0:1] +; GFX1064-NEXT: s_cbranch_scc0 .LBB27_4 ; GFX1064-NEXT: .LBB27_2: ; %bb1 ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_or_b64 s[2:3], s[2:3], exec @@ -1599,7 +1634,6 @@ define amdgpu_kernel void @test_invert_true_phi_cond_break_loop(i32 %arg) #0 { ; GFX1064-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] ; GFX1064-NEXT: s_branch .LBB27_1 ; GFX1064-NEXT: .LBB27_4: ; %bb9 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064-NEXT: v_mov_b32_e32 v0, 7 ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_write_b32 v0, v0 @@ -1911,11 +1945,13 @@ define amdgpu_ps float @test_wwm2(i32 inreg %idx) { ; GFX1032-LABEL: test_wwm2: ; GFX1032: ; %bb.0: ; %main_body ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 +; GFX1032-NEXT: s_mov_b32 s1, exec_lo ; GFX1032-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX1032-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB35_2 +; GFX1032-NEXT: s_and_b32 s2, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB35_2 ; GFX1032-NEXT: ; %bb.1: ; %if ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_mov_b32_e32 v1, s0 @@ -1925,18 +1961,20 @@ define amdgpu_ps float @test_wwm2(i32 inreg %idx) { ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: v_mov_b32_e32 v0, v2 ; GFX1032-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX1032-NEXT: .LBB35_2: ; %endif ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: .LBB35_2: ; %endif ; GFX1032-NEXT: ; return to shader part epilog ; ; GFX1064-LABEL: test_wwm2: ; GFX1064: ; %bb.0: ; %main_body ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX1064-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB35_2 +; GFX1064-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB35_2 ; GFX1064-NEXT: ; %bb.1: ; %if ; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064-NEXT: v_mov_b32_e32 v1, s0 @@ -1946,8 +1984,8 @@ define amdgpu_ps float @test_wwm2(i32 inreg %idx) { ; GFX1064-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v0, v2 ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX1064-NEXT: .LBB35_2: ; %endif ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: .LBB35_2: ; %endif ; GFX1064-NEXT: ; return to shader part epilog main_body: ; use mbcnt to make sure the branch is divergent @@ -1998,11 +2036,13 @@ define amdgpu_ps float @test_strict_wwm2(i32 inreg %idx) { ; GFX1032-LABEL: test_strict_wwm2: ; GFX1032: ; %bb.0: ; %main_body ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 +; GFX1032-NEXT: s_mov_b32 s1, exec_lo ; GFX1032-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX1032-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB37_2 +; GFX1032-NEXT: s_and_b32 s2, vcc_lo, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX1032-NEXT: s_cbranch_scc0 .LBB37_2 ; GFX1032-NEXT: ; %bb.1: ; %if ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_mov_b32_e32 v1, s0 @@ -2012,18 +2052,20 @@ define amdgpu_ps float @test_strict_wwm2(i32 inreg %idx) { ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: v_mov_b32_e32 v0, v2 ; GFX1032-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX1032-NEXT: .LBB37_2: ; %endif ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: .LBB37_2: ; %endif ; GFX1032-NEXT: ; return to shader part epilog ; ; GFX1064-LABEL: test_strict_wwm2: ; GFX1064: ; %bb.0: ; %main_body ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX1064-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB37_2 +; GFX1064-NEXT: s_and_b64 s[4:5], vcc, -1 +; GFX1064-NEXT: s_cmov_b64 exec, vcc +; GFX1064-NEXT: s_cbranch_scc0 .LBB37_2 ; GFX1064-NEXT: ; %bb.1: ; %if ; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064-NEXT: v_mov_b32_e32 v1, s0 @@ -2033,8 +2075,8 @@ define amdgpu_ps float @test_strict_wwm2(i32 inreg %idx) { ; GFX1064-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v0, v2 ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX1064-NEXT: .LBB37_2: ; %endif ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: .LBB37_2: ; %endif ; GFX1064-NEXT: ; return to shader part epilog main_body: ; use mbcnt to make sure the branch is divergent @@ -2497,10 +2539,13 @@ define amdgpu_kernel void @icmp64(i32 %n, i32 %s) { ; GFX1032-NEXT: s_cmp_gt_u32 s0, 9 ; GFX1032-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1032-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, s0 +; GFX1032-NEXT: s_and_b32 s0, s0, exec_lo +; GFX1032-NEXT: s_and_b32 s1, s0, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, s0 +; GFX1032-NEXT: s_cbranch_scc0 .LBB50_2 ; GFX1032-NEXT: ; %bb.1: ; %if.then ; GFX1032-NEXT: ; divergent unreachable -; GFX1032-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; GFX1032-NEXT: .LBB50_2: ; %UnifiedReturnBlock ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: icmp64: @@ -2531,10 +2576,13 @@ define amdgpu_kernel void @icmp64(i32 %n, i32 %s) { ; GFX1064-NEXT: s_cmp_gt_u32 s0, 9 ; GFX1064-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX1064-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] +; GFX1064-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX1064-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1064-NEXT: s_cmov_b64 exec, s[0:1] +; GFX1064-NEXT: s_cbranch_scc0 .LBB50_2 ; GFX1064-NEXT: ; %bb.1: ; %if.then ; GFX1064-NEXT: ; divergent unreachable -; GFX1064-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; GFX1064-NEXT: .LBB50_2: ; %UnifiedReturnBlock ; GFX1064-NEXT: s_endpgm entry: %id = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -2590,10 +2638,13 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) { ; GFX1032-NEXT: s_cmp_gt_u32 s0, 9 ; GFX1032-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1032-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, s0 +; GFX1032-NEXT: s_and_b32 s0, s0, exec_lo +; GFX1032-NEXT: s_and_b32 s1, s0, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, s0 +; GFX1032-NEXT: s_cbranch_scc0 .LBB51_2 ; GFX1032-NEXT: ; %bb.1: ; %if.then ; GFX1032-NEXT: ; divergent unreachable -; GFX1032-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; GFX1032-NEXT: .LBB51_2: ; %UnifiedReturnBlock ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: fcmp64: @@ -2622,10 +2673,13 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) { ; GFX1064-NEXT: s_cmp_gt_u32 s0, 9 ; GFX1064-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX1064-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] +; GFX1064-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX1064-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1064-NEXT: s_cmov_b64 exec, s[0:1] +; GFX1064-NEXT: s_cbranch_scc0 .LBB51_2 ; GFX1064-NEXT: ; %bb.1: ; %if.then ; GFX1064-NEXT: ; divergent unreachable -; GFX1064-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; GFX1064-NEXT: .LBB51_2: ; %UnifiedReturnBlock ; GFX1064-NEXT: s_endpgm entry: %id = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -2684,10 +2738,13 @@ define amdgpu_kernel void @icmp32(i32 %n, i32 %s) { ; GFX1032-NEXT: s_cmp_gt_u32 s0, 9 ; GFX1032-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1032-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, s0 +; GFX1032-NEXT: s_and_b32 s0, s0, exec_lo +; GFX1032-NEXT: s_and_b32 s1, s0, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, s0 +; GFX1032-NEXT: s_cbranch_scc0 .LBB52_2 ; GFX1032-NEXT: ; %bb.1: ; %if.then ; GFX1032-NEXT: ; divergent unreachable -; GFX1032-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; GFX1032-NEXT: .LBB52_2: ; %UnifiedReturnBlock ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: icmp32: @@ -2718,10 +2775,13 @@ define amdgpu_kernel void @icmp32(i32 %n, i32 %s) { ; GFX1064-NEXT: s_cmp_gt_u32 s0, 9 ; GFX1064-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX1064-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] +; GFX1064-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX1064-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1064-NEXT: s_cmov_b64 exec, s[0:1] +; GFX1064-NEXT: s_cbranch_scc0 .LBB52_2 ; GFX1064-NEXT: ; %bb.1: ; %if.then ; GFX1064-NEXT: ; divergent unreachable -; GFX1064-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; GFX1064-NEXT: .LBB52_2: ; %UnifiedReturnBlock ; GFX1064-NEXT: s_endpgm entry: %id = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -2776,10 +2836,13 @@ define amdgpu_kernel void @fcmp32(float %n, float %s) { ; GFX1032-NEXT: s_cmp_gt_u32 s0, 9 ; GFX1032-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1032-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, s0 +; GFX1032-NEXT: s_and_b32 s0, s0, exec_lo +; GFX1032-NEXT: s_and_b32 s1, s0, -1 +; GFX1032-NEXT: s_cmov_b32 exec_lo, s0 +; GFX1032-NEXT: s_cbranch_scc0 .LBB53_2 ; GFX1032-NEXT: ; %bb.1: ; %if.then ; GFX1032-NEXT: ; divergent unreachable -; GFX1032-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; GFX1032-NEXT: .LBB53_2: ; %UnifiedReturnBlock ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: fcmp32: @@ -2808,10 +2871,13 @@ define amdgpu_kernel void @fcmp32(float %n, float %s) { ; GFX1064-NEXT: s_cmp_gt_u32 s0, 9 ; GFX1064-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX1064-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] +; GFX1064-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX1064-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX1064-NEXT: s_cmov_b64 exec, s[0:1] +; GFX1064-NEXT: s_cbranch_scc0 .LBB53_2 ; GFX1064-NEXT: ; %bb.1: ; %if.then ; GFX1064-NEXT: ; divergent unreachable -; GFX1064-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; GFX1064-NEXT: .LBB53_2: ; %UnifiedReturnBlock ; GFX1064-NEXT: s_endpgm entry: %id = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/while-break.ll b/llvm/test/CodeGen/AMDGPU/while-break.ll index 13b37b40ee95c..b570d5a247529 100644 --- a/llvm/test/CodeGen/AMDGPU/while-break.ll +++ b/llvm/test/CodeGen/AMDGPU/while-break.ll @@ -4,47 +4,57 @@ define amdgpu_ps float @while_break(i32 %z, float %v, i32 %x, i32 %y) #0 { ; GCN-LABEL: while_break: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_mov_b32 s1, -1 -; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: s_mov_b32 s0, -1 +; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: s_branch .LBB0_2 ; GCN-NEXT: .LBB0_1: ; %Flow2 ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GCN-NEXT: s_and_b32 s2, exec_lo, s3 -; GCN-NEXT: s_or_b32 s0, s2, s0 -; GCN-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GCN-NEXT: s_cbranch_execz .LBB0_8 +; GCN-NEXT: s_and_b32 s2, exec_lo, s2 +; GCN-NEXT: s_or_b32 s1, s2, s1 +; GCN-NEXT: s_andn2_b32 s2, exec_lo, s1 +; GCN-NEXT: s_and_b32 s3, s2, -1 +; GCN-NEXT: s_cselect_b32 exec_lo, s2, s1 +; GCN-NEXT: s_cbranch_scc0 .LBB0_8 ; GCN-NEXT: .LBB0_2: ; %header ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: s_add_i32 s1, s1, 1 +; GCN-NEXT: s_add_i32 s0, s0, 1 ; GCN-NEXT: s_mov_b32 s2, 0 -; GCN-NEXT: v_cmp_ge_i32_e32 vcc_lo, s1, v2 -; GCN-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GCN-NEXT: s_xor_b32 s3, exec_lo, s3 +; GCN-NEXT: v_cmp_ge_i32_e32 vcc_lo, s0, v2 +; GCN-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GCN-NEXT: s_and_b32 s4, vcc_lo, -1 +; GCN-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GCN-NEXT: s_cbranch_scc0 .LBB0_4 ; GCN-NEXT: ; %bb.3: ; %else ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GCN-NEXT: v_cmp_lt_i32_e32 vcc_lo, s1, v3 +; GCN-NEXT: v_cmp_lt_i32_e32 vcc_lo, s0, v3 ; GCN-NEXT: s_and_b32 s2, vcc_lo, exec_lo -; GCN-NEXT: ; %bb.4: ; %Flow +; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GCN-NEXT: .LBB0_4: ; %Flow ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GCN-NEXT: s_andn2_saveexec_b32 s3, s3 +; GCN-NEXT: s_xor_b32 s4, s3, exec_lo +; GCN-NEXT: s_and_b32 s5, s3, -1 +; GCN-NEXT: s_cmov_b32 exec_lo, s3 +; GCN-NEXT: s_cbranch_scc0 .LBB0_6 ; GCN-NEXT: ; %bb.5: ; %if ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GCN-NEXT: s_or_b32 s2, s2, exec_lo -; GCN-NEXT: ; %bb.6: ; %Flow1 +; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GCN-NEXT: .LBB0_6: ; %Flow1 ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GCN-NEXT: s_mov_b32 s3, -1 -; GCN-NEXT: s_and_saveexec_b32 s4, s2 -; GCN-NEXT: s_cbranch_execz .LBB0_1 +; GCN-NEXT: s_and_b32 s4, s2, exec_lo +; GCN-NEXT: s_mov_b32 s3, exec_lo +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_and_b32 s5, s4, -1 +; GCN-NEXT: s_cmov_b32 exec_lo, s4 +; GCN-NEXT: s_cbranch_scc0 .LBB0_1 ; GCN-NEXT: ; %bb.7: ; %latch ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GCN-NEXT: v_cmp_lt_i32_e32 vcc_lo, s1, v0 -; GCN-NEXT: s_orn2_b32 s3, vcc_lo, exec_lo +; GCN-NEXT: v_cmp_lt_i32_e32 vcc_lo, s0, v0 +; GCN-NEXT: s_orn2_b32 s2, vcc_lo, exec_lo +; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GCN-NEXT: s_branch .LBB0_1 ; GCN-NEXT: .LBB0_8: ; %end -; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: ; return to shader part epilog entry: @@ -79,49 +89,59 @@ end: define amdgpu_ps float @while_break2(i32 %z, float %v, i32 %x, i32 %y) #0 { ; GCN-LABEL: while_break2: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_mov_b32 s1, -1 -; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: s_mov_b32 s0, -1 +; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: s_branch .LBB1_2 ; GCN-NEXT: .LBB1_1: ; %Flow2 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GCN-NEXT: s_and_b32 s2, exec_lo, s3 -; GCN-NEXT: s_or_b32 s0, s2, s0 -; GCN-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GCN-NEXT: s_cbranch_execz .LBB1_8 +; GCN-NEXT: s_and_b32 s2, exec_lo, s2 +; GCN-NEXT: s_or_b32 s1, s2, s1 +; GCN-NEXT: s_andn2_b32 s2, exec_lo, s1 +; GCN-NEXT: s_and_b32 s3, s2, -1 +; GCN-NEXT: s_cselect_b32 exec_lo, s2, s1 +; GCN-NEXT: s_cbranch_scc0 .LBB1_8 ; GCN-NEXT: .LBB1_2: ; %header ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: s_add_i32 s1, s1, 1 +; GCN-NEXT: s_add_i32 s0, s0, 1 ; GCN-NEXT: s_mov_b32 s2, 0 -; GCN-NEXT: v_cmp_ge_i32_e32 vcc_lo, s1, v2 -; GCN-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GCN-NEXT: s_xor_b32 s3, exec_lo, s3 +; GCN-NEXT: v_cmp_ge_i32_e32 vcc_lo, s0, v2 +; GCN-NEXT: s_xor_b32 s3, vcc_lo, exec_lo +; GCN-NEXT: s_and_b32 s4, vcc_lo, -1 +; GCN-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GCN-NEXT: s_cbranch_scc0 .LBB1_4 ; GCN-NEXT: ; %bb.3: ; %if ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GCN-NEXT: s_mov_b32 s2, exec_lo -; GCN-NEXT: ; %bb.4: ; %Flow +; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GCN-NEXT: .LBB1_4: ; %Flow ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: s_andn2_saveexec_b32 s3, s3 +; GCN-NEXT: s_xor_b32 s4, s3, exec_lo +; GCN-NEXT: s_and_b32 s5, s3, -1 +; GCN-NEXT: s_cmov_b32 exec_lo, s3 +; GCN-NEXT: s_cbranch_scc0 .LBB1_6 ; GCN-NEXT: ; %bb.5: ; %else ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: v_cmp_lt_i32_e32 vcc_lo, s1, v3 +; GCN-NEXT: v_cmp_lt_i32_e32 vcc_lo, s0, v3 ; GCN-NEXT: s_andn2_b32 s2, s2, exec_lo -; GCN-NEXT: s_and_b32 s4, vcc_lo, exec_lo -; GCN-NEXT: s_or_b32 s2, s2, s4 -; GCN-NEXT: ; %bb.6: ; %Flow1 +; GCN-NEXT: s_and_b32 s3, vcc_lo, exec_lo +; GCN-NEXT: s_or_b32 s2, s2, s3 +; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GCN-NEXT: .LBB1_6: ; %Flow1 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GCN-NEXT: s_mov_b32 s3, -1 -; GCN-NEXT: s_and_saveexec_b32 s4, s2 -; GCN-NEXT: s_cbranch_execz .LBB1_1 +; GCN-NEXT: s_and_b32 s4, s2, exec_lo +; GCN-NEXT: s_mov_b32 s3, exec_lo +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_and_b32 s5, s4, -1 +; GCN-NEXT: s_cmov_b32 exec_lo, s4 +; GCN-NEXT: s_cbranch_scc0 .LBB1_1 ; GCN-NEXT: ; %bb.7: ; %latch ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: v_cmp_lt_i32_e32 vcc_lo, s1, v0 -; GCN-NEXT: s_orn2_b32 s3, vcc_lo, exec_lo +; GCN-NEXT: v_cmp_lt_i32_e32 vcc_lo, s0, v0 +; GCN-NEXT: s_orn2_b32 s2, vcc_lo, exec_lo +; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GCN-NEXT: s_branch .LBB1_1 ; GCN-NEXT: .LBB1_8: ; %end -; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: ; return to shader part epilog entry: diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll index 95dfb12c8dbae..babb79a3359ae 100644 --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -505,9 +505,11 @@ define amdgpu_ps float @test_wwm3(i32 inreg %idx) { ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 +; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec +; GFX9-W64-NEXT: s_and_b64 s[4:5], vcc, -1 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-W64-NEXT: s_cbranch_execz .LBB13_2 +; GFX9-W64-NEXT: s_cmov_b64 exec, vcc +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB13_2 ; GFX9-W64-NEXT: ; %bb.1: ; %if ; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 @@ -517,18 +519,20 @@ define amdgpu_ps float @test_wwm3(i32 inreg %idx) { ; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-W64-NEXT: .LBB13_2: ; %endif ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-W64-NEXT: .LBB13_2: ; %endif ; GFX9-W64-NEXT: ; return to shader part epilog ; ; GFX10-W32-LABEL: test_wwm3: ; GFX10-W32: ; %bb.0: ; %main_body ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 +; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10-W32-NEXT: s_cbranch_execz .LBB13_2 +; GFX10-W32-NEXT: s_and_b32 s2, vcc_lo, -1 +; GFX10-W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB13_2 ; GFX10-W32-NEXT: ; %bb.1: ; %if ; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 @@ -538,8 +542,8 @@ define amdgpu_ps float @test_wwm3(i32 inreg %idx) { ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX10-W32-NEXT: .LBB13_2: ; %endif ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-W32-NEXT: .LBB13_2: ; %endif ; GFX10-W32-NEXT: ; return to shader part epilog main_body: ; use mbcnt to make sure the branch is divergent @@ -570,9 +574,11 @@ define amdgpu_ps float @test_wwm4(i32 inreg %idx) { ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 +; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec +; GFX9-W64-NEXT: s_and_b64 s[4:5], vcc, -1 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-W64-NEXT: s_cbranch_execz .LBB14_2 +; GFX9-W64-NEXT: s_cmov_b64 exec, vcc +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB14_2 ; GFX9-W64-NEXT: ; %bb.1: ; %if ; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 @@ -581,18 +587,20 @@ define amdgpu_ps float @test_wwm4(i32 inreg %idx) { ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1 ; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-W64-NEXT: .LBB14_2: ; %endif ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-W64-NEXT: .LBB14_2: ; %endif ; GFX9-W64-NEXT: ; return to shader part epilog ; ; GFX10-W32-LABEL: test_wwm4: ; GFX10-W32: ; %bb.0: ; %main_body ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 +; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10-W32-NEXT: s_cbranch_execz .LBB14_2 +; GFX10-W32-NEXT: s_and_b32 s2, vcc_lo, -1 +; GFX10-W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB14_2 ; GFX10-W32-NEXT: ; %bb.1: ; %if ; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 @@ -601,8 +609,8 @@ define amdgpu_ps float @test_wwm4(i32 inreg %idx) { ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 -; GFX10-W32-NEXT: .LBB14_2: ; %endif ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-W32-NEXT: .LBB14_2: ; %endif ; GFX10-W32-NEXT: ; return to shader part epilog main_body: ; use mbcnt to make sure the branch is divergent @@ -685,16 +693,18 @@ main_body: define amdgpu_ps float @test_wwm6_then() { ; GFX9-W64-LABEL: test_wwm6_then: ; GFX9-W64: ; %bb.0: ; %main_body -; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec +; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) -; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 +; GFX9-W64-NEXT: s_and_b64 s[2:3], vcc, -1 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-W64-NEXT: s_cbranch_execz .LBB16_2 +; GFX9-W64-NEXT: s_cmov_b64 exec, vcc +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB16_2 ; GFX9-W64-NEXT: ; %bb.1: ; %if ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc @@ -702,22 +712,24 @@ define amdgpu_ps float @test_wwm6_then() { ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-W64-NEXT: .LBB16_2: ; %endif ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-W64-NEXT: .LBB16_2: ; %endif ; GFX9-W64-NEXT: ; return to shader part epilog ; ; GFX10-W32-LABEL: test_wwm6_then: ; GFX10-W32: ; %bb.0: ; %main_body -; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo +; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) -; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-W32-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX10-W32-NEXT: s_cbranch_execz .LBB16_2 +; GFX10-W32-NEXT: s_and_b32 s1, vcc_lo, -1 +; GFX10-W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB16_2 ; GFX10-W32-NEXT: ; %bb.1: ; %if ; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc @@ -725,8 +737,8 @@ define amdgpu_ps float @test_wwm6_then() { ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 -; GFX10-W32-NEXT: .LBB16_2: ; %endif ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-W32-NEXT: .LBB16_2: ; %endif ; GFX10-W32-NEXT: ; return to shader part epilog main_body: %src0 = load volatile float, ptr addrspace(1) undef @@ -771,15 +783,16 @@ define amdgpu_ps float @test_wwm6_loop() { ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-W64-NEXT: v_add_u32_e32 v3, -1, v3 ; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v2 -; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-W64-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-W64-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v2 +; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-W64-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-W64-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-W64-NEXT: s_cbranch_execnz .LBB17_1 +; GFX9-W64-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-W64-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX9-W64-NEXT: ; %bb.2: ; %endloop -; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-W64-NEXT: ; return to shader part epilog ; ; GFX10-W32-LABEL: test_wwm6_loop: @@ -798,16 +811,17 @@ define amdgpu_ps float @test_wwm6_loop() { ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-W32-NEXT: v_add_nc_u32_e32 v3, -1, v3 +; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-W32-NEXT: v_add_f32_e32 v2, v1, v2 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-W32-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX10-W32-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX10-W32-NEXT: s_cbranch_execnz .LBB17_1 +; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-W32-NEXT: s_andn2_b32 s1, exec_lo, s0 +; GFX10-W32-NEXT: s_and_b32 s2, s1, -1 +; GFX10-W32-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX10-W32-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX10-W32-NEXT: ; %bb.2: ; %endloop -; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX10-W32-NEXT: ; return to shader part epilog main_body: %src0 = load volatile float, ptr addrspace(1) undef @@ -965,9 +979,11 @@ define amdgpu_ps float @test_strict_wqm3(i32 inreg %idx) { ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 +; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec +; GFX9-W64-NEXT: s_and_b64 s[4:5], vcc, -1 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-W64-NEXT: s_cbranch_execz .LBB21_2 +; GFX9-W64-NEXT: s_cmov_b64 exec, vcc +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB21_2 ; GFX9-W64-NEXT: ; %bb.1: ; %if ; GFX9-W64-NEXT: s_mov_b64 s[4:5], exec ; GFX9-W64-NEXT: s_wqm_b64 exec, exec @@ -978,18 +994,20 @@ define amdgpu_ps float @test_strict_wqm3(i32 inreg %idx) { ; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-W64-NEXT: .LBB21_2: ; %endif ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-W64-NEXT: .LBB21_2: ; %endif ; GFX9-W64-NEXT: ; return to shader part epilog ; ; GFX10-W32-LABEL: test_strict_wqm3: ; GFX10-W32: ; %bb.0: ; %main_body ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 +; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10-W32-NEXT: s_cbranch_execz .LBB21_2 +; GFX10-W32-NEXT: s_and_b32 s2, vcc_lo, -1 +; GFX10-W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB21_2 ; GFX10-W32-NEXT: ; %bb.1: ; %if ; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo @@ -1000,8 +1018,8 @@ define amdgpu_ps float @test_strict_wqm3(i32 inreg %idx) { ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX10-W32-NEXT: .LBB21_2: ; %endif ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-W32-NEXT: .LBB21_2: ; %endif ; GFX10-W32-NEXT: ; return to shader part epilog main_body: ; use mbcnt to make sure the branch is divergent @@ -1032,9 +1050,11 @@ define amdgpu_ps float @test_strict_wqm4(i32 inreg %idx) { ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 +; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec +; GFX9-W64-NEXT: s_and_b64 s[4:5], vcc, -1 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-W64-NEXT: s_cbranch_execz .LBB22_2 +; GFX9-W64-NEXT: s_cmov_b64 exec, vcc +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB22_2 ; GFX9-W64-NEXT: ; %bb.1: ; %if ; GFX9-W64-NEXT: s_mov_b64 s[4:5], exec ; GFX9-W64-NEXT: s_wqm_b64 exec, exec @@ -1044,18 +1064,20 @@ define amdgpu_ps float @test_strict_wqm4(i32 inreg %idx) { ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1 ; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-W64-NEXT: .LBB22_2: ; %endif ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-W64-NEXT: .LBB22_2: ; %endif ; GFX9-W64-NEXT: ; return to shader part epilog ; ; GFX10-W32-LABEL: test_strict_wqm4: ; GFX10-W32: ; %bb.0: ; %main_body ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 +; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10-W32-NEXT: s_cbranch_execz .LBB22_2 +; GFX10-W32-NEXT: s_and_b32 s2, vcc_lo, -1 +; GFX10-W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB22_2 ; GFX10-W32-NEXT: ; %bb.1: ; %if ; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo @@ -1065,8 +1087,8 @@ define amdgpu_ps float @test_strict_wqm4(i32 inreg %idx) { ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 -; GFX10-W32-NEXT: .LBB22_2: ; %endif ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-W32-NEXT: .LBB22_2: ; %endif ; GFX10-W32-NEXT: ; return to shader part epilog main_body: ; use mbcnt to make sure the branch is divergent @@ -1153,16 +1175,18 @@ define amdgpu_ps float @test_strict_wqm6_then() { ; GFX9-W64-LABEL: test_strict_wqm6_then: ; GFX9-W64: ; %bb.0: ; %main_body ; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec +; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec ; GFX9-W64-NEXT: s_wqm_b64 exec, exec ; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) -; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 +; GFX9-W64-NEXT: s_and_b64 s[2:3], vcc, -1 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-W64-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-W64-NEXT: s_cmov_b64 exec, vcc +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB24_2 ; GFX9-W64-NEXT: ; %bb.1: ; %if ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec ; GFX9-W64-NEXT: s_wqm_b64 exec, exec @@ -1171,23 +1195,25 @@ define amdgpu_ps float @test_strict_wqm6_then() { ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-W64-NEXT: .LBB24_2: ; %endif ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-W64-NEXT: .LBB24_2: ; %endif ; GFX9-W64-NEXT: ; return to shader part epilog ; ; GFX10-W32-LABEL: test_strict_wqm6_then: ; GFX10-W32: ; %bb.0: ; %main_body ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo +; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) -; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-W32-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX10-W32-NEXT: s_cbranch_execz .LBB24_2 +; GFX10-W32-NEXT: s_and_b32 s1, vcc_lo, -1 +; GFX10-W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB24_2 ; GFX10-W32-NEXT: ; %bb.1: ; %if ; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo @@ -1196,8 +1222,8 @@ define amdgpu_ps float @test_strict_wqm6_then() { ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 -; GFX10-W32-NEXT: .LBB24_2: ; %endif ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-W32-NEXT: .LBB24_2: ; %endif ; GFX10-W32-NEXT: ; return to shader part epilog main_body: %src0 = load volatile float, ptr addrspace(1) undef @@ -1244,16 +1270,17 @@ define amdgpu_ps float @test_strict_wqm6_loop() { ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-W64-NEXT: v_add_u32_e32 v3, -1, v3 ; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec +; GFX9-W64-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-W64-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-W64-NEXT: s_mov_b64 s[4:5], exec ; GFX9-W64-NEXT: s_wqm_b64 exec, exec ; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v2 -; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-W64-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-W64-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-W64-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-W64-NEXT: s_cbranch_execnz .LBB25_1 +; GFX9-W64-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-W64-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX9-W64-NEXT: ; %bb.2: ; %endloop -; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-W64-NEXT: ; return to shader part epilog ; ; GFX10-W32-LABEL: test_strict_wqm6_loop: @@ -1266,6 +1293,7 @@ define amdgpu_ps float @test_strict_wqm6_loop() { ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX10-W32-NEXT: s_mov_b32 s0, 0 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0 +; GFX10-W32-NEXT: .p2align 6 ; GFX10-W32-NEXT: .LBB25_1: ; %loop ; GFX10-W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo @@ -1275,16 +1303,17 @@ define amdgpu_ps float @test_strict_wqm6_loop() { ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-W32-NEXT: v_add_nc_u32_e32 v3, -1, v3 ; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo +; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: v_add_f32_e32 v2, v1, v2 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-W32-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX10-W32-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX10-W32-NEXT: s_cbranch_execnz .LBB25_1 +; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-W32-NEXT: s_andn2_b32 s1, exec_lo, s0 +; GFX10-W32-NEXT: s_and_b32 s2, s1, -1 +; GFX10-W32-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX10-W32-NEXT: s_cbranch_scc1 .LBB25_1 ; GFX10-W32-NEXT: ; %bb.2: ; %endloop -; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX10-W32-NEXT: ; return to shader part epilog main_body: %src0 = load volatile float, ptr addrspace(1) undef @@ -1365,23 +1394,27 @@ define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec ; GFX9-W64-NEXT: s_wqm_b64 exec, exec ; GFX9-W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc -; GFX9-W64-NEXT: s_xor_b64 s[14:15], exec, s[14:15] -; GFX9-W64-NEXT: s_cbranch_execz .LBB27_2 +; GFX9-W64-NEXT: s_xor_b64 s[16:17], vcc, exec +; GFX9-W64-NEXT: s_and_b64 s[14:15], vcc, -1 +; GFX9-W64-NEXT: s_cmov_b64 exec, vcc +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB27_2 ; GFX9-W64-NEXT: ; %bb.1: ; %ELSE -; GFX9-W64-NEXT: s_and_saveexec_b64 s[16:17], s[12:13] +; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], s[12:13] ; GFX9-W64-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen ; GFX9-W64-NEXT: ; implicit-def: $vgpr0 -; GFX9-W64-NEXT: s_mov_b64 exec, s[16:17] +; GFX9-W64-NEXT: s_mov_b64 exec, s[14:15] +; GFX9-W64-NEXT: s_or_b64 exec, exec, s[16:17] ; GFX9-W64-NEXT: .LBB27_2: ; %Flow -; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[14:15], s[14:15] -; GFX9-W64-NEXT: s_cbranch_execz .LBB27_4 +; GFX9-W64-NEXT: s_xor_b64 s[14:15], s[16:17], exec +; GFX9-W64-NEXT: s_and_b64 s[18:19], s[16:17], -1 +; GFX9-W64-NEXT: s_cmov_b64 exec, s[16:17] +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB27_4 ; GFX9-W64-NEXT: ; %bb.3: ; %IF ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: image_sample v2, v0, s[0:7], s[8:11] dmask:0x1 -; GFX9-W64-NEXT: .LBB27_4: ; %END ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX9-W64-NEXT: .LBB27_4: ; %END ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 @@ -1391,24 +1424,28 @@ define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10-W32: ; %bb.0: ; %main_body ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo -; GFX10-W32-NEXT: v_cmpx_ne_u32_e32 0, v1 -; GFX10-W32-NEXT: s_xor_b32 s13, exec_lo, s13 -; GFX10-W32-NEXT: s_cbranch_execz .LBB27_2 +; GFX10-W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; GFX10-W32-NEXT: s_xor_b32 s14, vcc_lo, exec_lo +; GFX10-W32-NEXT: s_and_b32 s13, vcc_lo, -1 +; GFX10-W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB27_2 ; GFX10-W32-NEXT: ; %bb.1: ; %ELSE -; GFX10-W32-NEXT: s_and_saveexec_b32 s14, s12 +; GFX10-W32-NEXT: s_and_saveexec_b32 s13, s12 ; GFX10-W32-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen ; GFX10-W32-NEXT: ; implicit-def: $vgpr0 -; GFX10-W32-NEXT: s_mov_b32 exec_lo, s14 +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s13 +; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s14 ; GFX10-W32-NEXT: .LBB27_2: ; %Flow -; GFX10-W32-NEXT: s_andn2_saveexec_b32 s13, s13 -; GFX10-W32-NEXT: s_cbranch_execz .LBB27_4 +; GFX10-W32-NEXT: s_xor_b32 s13, s14, exec_lo +; GFX10-W32-NEXT: s_and_b32 s15, s14, -1 +; GFX10-W32-NEXT: s_cmov_b32 exec_lo, s14 +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB27_4 ; GFX10-W32-NEXT: ; %bb.3: ; %IF ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: image_sample v2, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D -; GFX10-W32-NEXT: .LBB27_4: ; %END ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13 +; GFX10-W32-NEXT: .LBB27_4: ; %END ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 @@ -1441,25 +1478,27 @@ define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec ; GFX9-W64-NEXT: s_wqm_b64 exec, exec ; GFX9-W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc -; GFX9-W64-NEXT: s_xor_b64 s[14:15], exec, s[14:15] -; GFX9-W64-NEXT: s_cbranch_execz .LBB28_2 +; GFX9-W64-NEXT: s_xor_b64 s[14:15], vcc, exec +; GFX9-W64-NEXT: s_and_b64 s[16:17], vcc, -1 +; GFX9-W64-NEXT: s_cmov_b64 exec, vcc +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB28_2 ; GFX9-W64-NEXT: ; %bb.1: ; %IF ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: image_sample v2, v0, s[0:7], s[8:11] dmask:0x1 ; GFX9-W64-NEXT: ; implicit-def: $vgpr0 +; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15] ; GFX9-W64-NEXT: .LBB28_2: ; %Flow -; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], s[14:15] ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] -; GFX9-W64-NEXT: s_and_b64 s[0:1], exec, s[0:1] -; GFX9-W64-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX9-W64-NEXT: s_cbranch_execz .LBB28_4 +; GFX9-W64-NEXT: s_xor_b64 s[0:1], s[14:15], exec +; GFX9-W64-NEXT: s_and_b64 s[2:3], s[14:15], -1 +; GFX9-W64-NEXT: s_cmov_b64 exec, s[14:15] +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB28_4 ; GFX9-W64-NEXT: ; %bb.3: ; %ELSE ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen -; GFX9-W64-NEXT: .LBB28_4: ; %END ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-W64-NEXT: .LBB28_4: ; %END ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-W64-NEXT: ; return to shader part epilog @@ -1468,26 +1507,28 @@ define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10-W32: ; %bb.0: ; %main_body ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo -; GFX10-W32-NEXT: v_cmpx_ne_u32_e32 0, v1 -; GFX10-W32-NEXT: s_xor_b32 s13, exec_lo, s13 -; GFX10-W32-NEXT: s_cbranch_execz .LBB28_2 +; GFX10-W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; GFX10-W32-NEXT: s_xor_b32 s13, vcc_lo, exec_lo +; GFX10-W32-NEXT: s_and_b32 s14, vcc_lo, -1 +; GFX10-W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB28_2 ; GFX10-W32-NEXT: ; %bb.1: ; %IF ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: image_sample v2, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: ; implicit-def: $vgpr0 +; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13 ; GFX10-W32-NEXT: .LBB28_2: ; %Flow -; GFX10-W32-NEXT: s_or_saveexec_b32 s0, s13 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX10-W32-NEXT: s_and_b32 s0, exec_lo, s0 -; GFX10-W32-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX10-W32-NEXT: s_cbranch_execz .LBB28_4 +; GFX10-W32-NEXT: s_xor_b32 s0, s13, exec_lo +; GFX10-W32-NEXT: s_and_b32 s1, s13, -1 +; GFX10-W32-NEXT: s_cmov_b32 exec_lo, s13 +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB28_4 ; GFX10-W32-NEXT: ; %bb.3: ; %ELSE ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen -; GFX10-W32-NEXT: .LBB28_4: ; %END ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-W32-NEXT: .LBB28_4: ; %END ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-W32-NEXT: ; return to shader part epilog @@ -1522,23 +1563,31 @@ define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i3 ; GFX9-W64-NEXT: buffer_store_dword v3, v0, s[0:3], 0 idxen ; GFX9-W64-NEXT: s_wqm_b64 exec, exec ; GFX9-W64-NEXT: buffer_load_dword v0, v1, s[0:3], 0 idxen +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 +; GFX9-W64-NEXT: s_xor_b64 s[14:15], vcc, exec ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-W64-NEXT: s_and_b64 s[16:17], vcc, -1 ; GFX9-W64-NEXT: buffer_store_dword v4, v2, s[0:3], 0 idxen -; GFX9-W64-NEXT: s_wqm_b64 exec, exec -; GFX9-W64-NEXT: s_waitcnt vmcnt(1) -; GFX9-W64-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 ; GFX9-W64-NEXT: ; implicit-def: $vgpr0 -; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc -; GFX9-W64-NEXT: s_xor_b64 s[14:15], exec, s[14:15] +; GFX9-W64-NEXT: s_cselect_b32 s16, 1, 0 +; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: s_cmp_lg_u32 s16, 0 +; GFX9-W64-NEXT: s_cmov_b64 exec, vcc +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB29_2 ; GFX9-W64-NEXT: ; %bb.1: ; %ELSE ; GFX9-W64-NEXT: v_lshlrev_b32_e32 v0, 2, v5 ; GFX9-W64-NEXT: ; implicit-def: $vgpr5 -; GFX9-W64-NEXT: ; %bb.2: ; %Flow -; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[14:15], s[14:15] +; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX9-W64-NEXT: .LBB29_2: ; %Flow +; GFX9-W64-NEXT: s_xor_b64 s[16:17], s[14:15], exec +; GFX9-W64-NEXT: s_and_b64 s[18:19], s[14:15], -1 +; GFX9-W64-NEXT: s_cmov_b64 exec, s[14:15] +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB29_4 ; GFX9-W64-NEXT: ; %bb.3: ; %IF ; GFX9-W64-NEXT: v_lshl_add_u32 v0, v5, 1, v5 -; GFX9-W64-NEXT: ; %bb.4: ; %END -; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX9-W64-NEXT: s_or_b64 exec, exec, s[16:17] +; GFX9-W64-NEXT: .LBB29_4: ; %END ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) @@ -1554,21 +1603,29 @@ define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i3 ; GFX10-W32-NEXT: buffer_load_dword v0, v1, s[0:3], 0 idxen ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0, v0 +; GFX10-W32-NEXT: s_xor_b32 s13, vcc_lo, exec_lo ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-W32-NEXT: s_and_b32 s14, vcc_lo, -1 ; GFX10-W32-NEXT: buffer_store_dword v4, v2, s[0:3], 0 idxen ; GFX10-W32-NEXT: ; implicit-def: $vgpr0 +; GFX10-W32-NEXT: s_cselect_b32 s14, 1, 0 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo -; GFX10-W32-NEXT: s_xor_b32 s13, exec_lo, s13 +; GFX10-W32-NEXT: s_cmp_lg_u32 s14, 0 +; GFX10-W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB29_2 ; GFX10-W32-NEXT: ; %bb.1: ; %ELSE ; GFX10-W32-NEXT: v_lshlrev_b32_e32 v0, 2, v5 ; GFX10-W32-NEXT: ; implicit-def: $vgpr5 -; GFX10-W32-NEXT: ; %bb.2: ; %Flow -; GFX10-W32-NEXT: s_andn2_saveexec_b32 s13, s13 +; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13 +; GFX10-W32-NEXT: .LBB29_2: ; %Flow +; GFX10-W32-NEXT: s_xor_b32 s14, s13, exec_lo +; GFX10-W32-NEXT: s_and_b32 s15, s13, -1 +; GFX10-W32-NEXT: s_cmov_b32 exec_lo, s13 +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB29_4 ; GFX10-W32-NEXT: ; %bb.3: ; %IF ; GFX10-W32-NEXT: v_lshl_add_u32 v0, v5, 1, v5 -; GFX10-W32-NEXT: ; %bb.4: ; %END -; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13 +; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s14 +; GFX10-W32-NEXT: .LBB29_4: ; %END ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) @@ -1617,29 +1674,27 @@ define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX9-W64-NEXT: image_sample v1, v1, s[0:7], s[8:11] dmask:0x1 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v1 +; GFX9-W64-NEXT: s_xor_b64 s[0:1], vcc, exec +; GFX9-W64-NEXT: s_and_b64 s[2:3], vcc, -1 ; GFX9-W64-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen ; GFX9-W64-NEXT: ; implicit-def: $vgpr0 -; GFX9-W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9-W64-NEXT: s_cbranch_execnz .LBB30_3 -; GFX9-W64-NEXT: ; %bb.1: ; %Flow -; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX9-W64-NEXT: s_cbranch_execnz .LBB30_4 -; GFX9-W64-NEXT: .LBB30_2: ; %END -; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-W64-NEXT: s_waitcnt vmcnt(0) -; GFX9-W64-NEXT: s_branch .LBB30_5 -; GFX9-W64-NEXT: .LBB30_3: ; %ELSE +; GFX9-W64-NEXT: s_cmov_b64 exec, vcc +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB30_2 +; GFX9-W64-NEXT: ; %bb.1: ; %ELSE ; GFX9-W64-NEXT: v_mul_f32_e32 v0, 4.0, v1 ; GFX9-W64-NEXT: ; implicit-def: $vgpr1 -; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX9-W64-NEXT: s_cbranch_execz .LBB30_2 -; GFX9-W64-NEXT: .LBB30_4: ; %IF -; GFX9-W64-NEXT: v_mul_f32_e32 v0, 0x40400000, v1 ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-W64-NEXT: .LBB30_2: ; %Flow +; GFX9-W64-NEXT: s_xor_b64 s[2:3], s[0:1], exec +; GFX9-W64-NEXT: s_and_b64 s[4:5], s[0:1], -1 +; GFX9-W64-NEXT: s_cmov_b64 exec, s[0:1] +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB30_4 +; GFX9-W64-NEXT: ; %bb.3: ; %IF +; GFX9-W64-NEXT: v_mul_f32_e32 v0, 0x40400000, v1 +; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-W64-NEXT: .LBB30_4: ; %END ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) -; GFX9-W64-NEXT: s_branch .LBB30_5 -; GFX9-W64-NEXT: .LBB30_5: +; GFX9-W64-NEXT: ; return to shader part epilog ; ; GFX10-W32-LABEL: test_control_flow_3: ; GFX10-W32: ; %bb.0: ; %main_body @@ -1650,28 +1705,27 @@ define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: image_sample v1, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0, v1 +; GFX10-W32-NEXT: s_xor_b32 s0, vcc_lo, exec_lo +; GFX10-W32-NEXT: s_and_b32 s1, vcc_lo, -1 ; GFX10-W32-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen -; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo ; GFX10-W32-NEXT: ; implicit-def: $vgpr0 -; GFX10-W32-NEXT: v_cmpx_nlt_f32_e32 0, v1 -; GFX10-W32-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX10-W32-NEXT: s_cbranch_execnz .LBB30_3 -; GFX10-W32-NEXT: ; %bb.1: ; %Flow -; GFX10-W32-NEXT: s_andn2_saveexec_b32 s0, s0 -; GFX10-W32-NEXT: s_cbranch_execnz .LBB30_4 -; GFX10-W32-NEXT: .LBB30_2: ; %END -; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX10-W32-NEXT: s_branch .LBB30_5 -; GFX10-W32-NEXT: .LBB30_3: ; %ELSE +; GFX10-W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB30_2 +; GFX10-W32-NEXT: ; %bb.1: ; %ELSE ; GFX10-W32-NEXT: v_mul_f32_e32 v0, 4.0, v1 ; GFX10-W32-NEXT: ; implicit-def: $vgpr1 -; GFX10-W32-NEXT: s_andn2_saveexec_b32 s0, s0 -; GFX10-W32-NEXT: s_cbranch_execz .LBB30_2 -; GFX10-W32-NEXT: .LBB30_4: ; %IF -; GFX10-W32-NEXT: v_mul_f32_e32 v0, 0x40400000, v1 ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX10-W32-NEXT: s_branch .LBB30_5 -; GFX10-W32-NEXT: .LBB30_5: +; GFX10-W32-NEXT: .LBB30_2: ; %Flow +; GFX10-W32-NEXT: s_xor_b32 s1, s0, exec_lo +; GFX10-W32-NEXT: s_and_b32 s2, s0, -1 +; GFX10-W32-NEXT: s_cmov_b32 exec_lo, s0 +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB30_4 +; GFX10-W32-NEXT: ; %bb.3: ; %IF +; GFX10-W32-NEXT: v_mul_f32_e32 v0, 0x40400000, v1 +; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-W32-NEXT: .LBB30_4: ; %END +; GFX10-W32-NEXT: ; return to shader part epilog main_body: %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 %tex0 = extractelement <4 x float> %tex, i32 0 @@ -1702,8 +1756,10 @@ define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i3 ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec ; GFX9-W64-NEXT: s_wqm_b64 exec, exec ; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc -; GFX9-W64-NEXT: s_cbranch_execz .LBB31_2 +; GFX9-W64-NEXT: s_mov_b64 s[14:15], exec +; GFX9-W64-NEXT: s_and_b64 s[16:17], vcc, -1 +; GFX9-W64-NEXT: s_cmov_b64 exec, vcc +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB31_2 ; GFX9-W64-NEXT: ; %bb.1: ; %IF ; GFX9-W64-NEXT: s_and_saveexec_b64 s[16:17], s[12:13] ; GFX9-W64-NEXT: buffer_load_dword v1, off, s[0:3], 0 @@ -1711,8 +1767,8 @@ define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i3 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: buffer_store_dword v1, v2, s[0:3], 0 idxen ; GFX9-W64-NEXT: s_mov_b64 exec, s[16:17] -; GFX9-W64-NEXT: .LBB31_2: ; %END ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX9-W64-NEXT: .LBB31_2: ; %END ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) @@ -1724,9 +1780,11 @@ define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i3 ; GFX10-W32: ; %bb.0: ; %main_body ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo -; GFX10-W32-NEXT: v_cmpx_eq_u32_e32 0, v1 -; GFX10-W32-NEXT: s_cbranch_execz .LBB31_2 +; GFX10-W32-NEXT: s_and_b32 s14, vcc_lo, -1 +; GFX10-W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB31_2 ; GFX10-W32-NEXT: ; %bb.1: ; %IF ; GFX10-W32-NEXT: s_and_saveexec_b32 s14, s12 ; GFX10-W32-NEXT: buffer_load_dword v1, off, s[0:3], 0 @@ -1734,8 +1792,8 @@ define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i3 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: buffer_store_dword v1, v2, s[0:3], 0 idxen ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s14 -; GFX10-W32-NEXT: .LBB31_2: ; %END ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13 +; GFX10-W32-NEXT: .LBB31_2: ; %END ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) @@ -2254,9 +2312,11 @@ define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec ; GFX9-W64-NEXT: s_wqm_b64 exec, exec ; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-W64-NEXT: s_mov_b64 s[14:15], exec +; GFX9-W64-NEXT: s_and_b64 s[16:17], vcc, -1 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc -; GFX9-W64-NEXT: s_cbranch_execz .LBB40_2 +; GFX9-W64-NEXT: s_cmov_b64 exec, vcc +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB40_2 ; GFX9-W64-NEXT: ; %bb.1: ; %IF ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) @@ -2273,8 +2333,8 @@ define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-W64-NEXT: v_cvt_f32_i32_e32 v1, v0 -; GFX9-W64-NEXT: .LBB40_2: ; %ENDIF ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX9-W64-NEXT: .LBB40_2: ; %ENDIF ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-W64-NEXT: ; return to shader part epilog @@ -2285,8 +2345,10 @@ define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo -; GFX10-W32-NEXT: s_cbranch_execz .LBB40_2 +; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo +; GFX10-W32-NEXT: s_and_b32 s14, vcc_lo, -1 +; GFX10-W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB40_2 ; GFX10-W32-NEXT: ; %bb.1: ; %IF ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) @@ -2303,8 +2365,8 @@ define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-W32-NEXT: v_cvt_f32_i32_e32 v1, v0 -; GFX10-W32-NEXT: .LBB40_2: ; %ENDIF ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13 +; GFX10-W32-NEXT: .LBB40_2: ; %ENDIF ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-W32-NEXT: ; return to shader part epilog @@ -2418,9 +2480,11 @@ define amdgpu_ps float @test_strict_wwm3(i32 inreg %idx) { ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 +; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec +; GFX9-W64-NEXT: s_and_b64 s[4:5], vcc, -1 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-W64-NEXT: s_cbranch_execz .LBB43_2 +; GFX9-W64-NEXT: s_cmov_b64 exec, vcc +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB43_2 ; GFX9-W64-NEXT: ; %bb.1: ; %if ; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 @@ -2430,18 +2494,20 @@ define amdgpu_ps float @test_strict_wwm3(i32 inreg %idx) { ; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-W64-NEXT: .LBB43_2: ; %endif ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-W64-NEXT: .LBB43_2: ; %endif ; GFX9-W64-NEXT: ; return to shader part epilog ; ; GFX10-W32-LABEL: test_strict_wwm3: ; GFX10-W32: ; %bb.0: ; %main_body ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 +; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10-W32-NEXT: s_cbranch_execz .LBB43_2 +; GFX10-W32-NEXT: s_and_b32 s2, vcc_lo, -1 +; GFX10-W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB43_2 ; GFX10-W32-NEXT: ; %bb.1: ; %if ; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 @@ -2451,8 +2517,8 @@ define amdgpu_ps float @test_strict_wwm3(i32 inreg %idx) { ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX10-W32-NEXT: .LBB43_2: ; %endif ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-W32-NEXT: .LBB43_2: ; %endif ; GFX10-W32-NEXT: ; return to shader part epilog main_body: ; use mbcnt to make sure the branch is divergent @@ -2483,9 +2549,11 @@ define amdgpu_ps float @test_strict_wwm4(i32 inreg %idx) { ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 +; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec +; GFX9-W64-NEXT: s_and_b64 s[4:5], vcc, -1 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-W64-NEXT: s_cbranch_execz .LBB44_2 +; GFX9-W64-NEXT: s_cmov_b64 exec, vcc +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB44_2 ; GFX9-W64-NEXT: ; %bb.1: ; %if ; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 @@ -2494,18 +2562,20 @@ define amdgpu_ps float @test_strict_wwm4(i32 inreg %idx) { ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1 ; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-W64-NEXT: .LBB44_2: ; %endif ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-W64-NEXT: .LBB44_2: ; %endif ; GFX9-W64-NEXT: ; return to shader part epilog ; ; GFX10-W32-LABEL: test_strict_wwm4: ; GFX10-W32: ; %bb.0: ; %main_body ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 +; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10-W32-NEXT: s_cbranch_execz .LBB44_2 +; GFX10-W32-NEXT: s_and_b32 s2, vcc_lo, -1 +; GFX10-W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB44_2 ; GFX10-W32-NEXT: ; %bb.1: ; %if ; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 @@ -2514,8 +2584,8 @@ define amdgpu_ps float @test_strict_wwm4(i32 inreg %idx) { ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 -; GFX10-W32-NEXT: .LBB44_2: ; %endif ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-W32-NEXT: .LBB44_2: ; %endif ; GFX10-W32-NEXT: ; return to shader part epilog main_body: ; use mbcnt to make sure the branch is divergent @@ -2598,16 +2668,18 @@ main_body: define amdgpu_ps float @test_strict_wwm6_then() { ; GFX9-W64-LABEL: test_strict_wwm6_then: ; GFX9-W64: ; %bb.0: ; %main_body -; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec +; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) -; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 +; GFX9-W64-NEXT: s_and_b64 s[2:3], vcc, -1 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-W64-NEXT: s_cbranch_execz .LBB46_2 +; GFX9-W64-NEXT: s_cmov_b64 exec, vcc +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB46_2 ; GFX9-W64-NEXT: ; %bb.1: ; %if ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc @@ -2615,22 +2687,24 @@ define amdgpu_ps float @test_strict_wwm6_then() { ; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-W64-NEXT: .LBB46_2: ; %endif ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-W64-NEXT: .LBB46_2: ; %endif ; GFX9-W64-NEXT: ; return to shader part epilog ; ; GFX10-W32-LABEL: test_strict_wwm6_then: ; GFX10-W32: ; %bb.0: ; %main_body -; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo +; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) -; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-W32-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX10-W32-NEXT: s_cbranch_execz .LBB46_2 +; GFX10-W32-NEXT: s_and_b32 s1, vcc_lo, -1 +; GFX10-W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB46_2 ; GFX10-W32-NEXT: ; %bb.1: ; %if ; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc @@ -2638,8 +2712,8 @@ define amdgpu_ps float @test_strict_wwm6_then() { ; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 -; GFX10-W32-NEXT: .LBB46_2: ; %endif ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-W32-NEXT: .LBB46_2: ; %endif ; GFX10-W32-NEXT: ; return to shader part epilog main_body: %src0 = load volatile float, ptr addrspace(1) undef @@ -2680,15 +2754,16 @@ define amdgpu_ps float @test_strict_wwm6_loop() { ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-W64-NEXT: v_add_u32_e32 v3, -1, v3 ; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v2 -; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-W64-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-W64-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v2 +; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-W64-NEXT: s_and_b64 s[4:5], s[2:3], -1 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-W64-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-W64-NEXT: s_cbranch_execnz .LBB47_1 +; GFX9-W64-NEXT: s_cselect_b64 exec, s[2:3], s[0:1] +; GFX9-W64-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX9-W64-NEXT: ; %bb.2: ; %endloop -; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-W64-NEXT: ; return to shader part epilog ; ; GFX10-W32-LABEL: test_strict_wwm6_loop: @@ -2707,16 +2782,17 @@ define amdgpu_ps float @test_strict_wwm6_loop() { ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-W32-NEXT: v_add_nc_u32_e32 v3, -1, v3 +; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-W32-NEXT: v_add_f32_e32 v2, v1, v2 ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-W32-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX10-W32-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 -; GFX10-W32-NEXT: s_cbranch_execnz .LBB47_1 +; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-W32-NEXT: s_andn2_b32 s1, exec_lo, s0 +; GFX10-W32-NEXT: s_and_b32 s2, s1, -1 +; GFX10-W32-NEXT: s_cselect_b32 exec_lo, s1, s0 +; GFX10-W32-NEXT: s_cbranch_scc1 .LBB47_1 ; GFX10-W32-NEXT: ; %bb.2: ; %endloop -; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX10-W32-NEXT: ; return to shader part epilog main_body: %src0 = load volatile float, ptr addrspace(1) undef @@ -2790,9 +2866,11 @@ define amdgpu_ps float @test_strict_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec ; GFX9-W64-NEXT: s_wqm_b64 exec, exec ; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-W64-NEXT: s_mov_b64 s[14:15], exec +; GFX9-W64-NEXT: s_and_b64 s[16:17], vcc, -1 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc -; GFX9-W64-NEXT: s_cbranch_execz .LBB49_2 +; GFX9-W64-NEXT: s_cmov_b64 exec, vcc +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB49_2 ; GFX9-W64-NEXT: ; %bb.1: ; %IF ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) @@ -2809,8 +2887,8 @@ define amdgpu_ps float @test_strict_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i ; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-W64-NEXT: v_cvt_f32_i32_e32 v1, v0 -; GFX9-W64-NEXT: .LBB49_2: ; %ENDIF ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX9-W64-NEXT: .LBB49_2: ; %ENDIF ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-W64-NEXT: ; return to shader part epilog @@ -2821,8 +2899,10 @@ define amdgpu_ps float @test_strict_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo -; GFX10-W32-NEXT: s_cbranch_execz .LBB49_2 +; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo +; GFX10-W32-NEXT: s_and_b32 s14, vcc_lo, -1 +; GFX10-W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB49_2 ; GFX10-W32-NEXT: ; %bb.1: ; %IF ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) @@ -2839,8 +2919,8 @@ define amdgpu_ps float @test_strict_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i ; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-W32-NEXT: v_cvt_f32_i32_e32 v1, v0 -; GFX10-W32-NEXT: .LBB49_2: ; %ENDIF ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13 +; GFX10-W32-NEXT: .LBB49_2: ; %ENDIF ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-W32-NEXT: ; return to shader part epilog @@ -2872,11 +2952,13 @@ define amdgpu_ps float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i ; GFX9-W64: ; %bb.0: ; %main_body ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec ; GFX9-W64-NEXT: s_wqm_b64 exec, exec -; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-W64-NEXT: s_mov_b64 s[14:15], exec +; GFX9-W64-NEXT: s_and_b64 s[16:17], vcc, -1 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc -; GFX9-W64-NEXT: s_cbranch_execz .LBB50_2 +; GFX9-W64-NEXT: s_cmov_b64 exec, vcc +; GFX9-W64-NEXT: s_cbranch_scc0 .LBB50_2 ; GFX9-W64-NEXT: ; %bb.1: ; %IF ; GFX9-W64-NEXT: image_sample v2, v2, s[0:7], s[8:11] dmask:0x1 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) @@ -2887,8 +2969,8 @@ define amdgpu_ps float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i ; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-W64-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-W64-NEXT: .LBB50_2: ; %ENDIF ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX9-W64-NEXT: .LBB50_2: ; %ENDIF ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-W64-NEXT: ; return to shader part epilog ; @@ -2896,11 +2978,13 @@ define amdgpu_ps float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i ; GFX10-W32: ; %bb.0: ; %main_body ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo -; GFX10-W32-NEXT: v_cmpx_eq_u32_e32 0, v1 -; GFX10-W32-NEXT: s_cbranch_execz .LBB50_2 +; GFX10-W32-NEXT: s_and_b32 s14, vcc_lo, -1 +; GFX10-W32-NEXT: s_cmov_b32 exec_lo, vcc_lo +; GFX10-W32-NEXT: s_cbranch_scc0 .LBB50_2 ; GFX10-W32-NEXT: ; %bb.1: ; %IF ; GFX10-W32-NEXT: image_sample v2, v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) @@ -2911,8 +2995,8 @@ define amdgpu_ps float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i ; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-W32-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX10-W32-NEXT: .LBB50_2: ; %ENDIF ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13 +; GFX10-W32-NEXT: .LBB50_2: ; %ENDIF ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-W32-NEXT: ; return to shader part epilog main_body: diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll index e79cb66dcd776..bff88ef8bd663 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -150,83 +150,88 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) ; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47] -; GFX9-O0-NEXT: s_mov_b32 s40, s6 +; GFX9-O0-NEXT: s_mov_b32 s36, s6 ; GFX9-O0-NEXT: s_mov_b32 s34, s4 -; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41 -; GFX9-O0-NEXT: s_mov_b32 s41, s7 -; GFX9-O0-NEXT: s_mov_b32 s42, s41 -; GFX9-O0-NEXT: s_mov_b32 s43, s40 +; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37 +; GFX9-O0-NEXT: s_mov_b32 s37, s7 +; GFX9-O0-NEXT: s_mov_b32 s38, s37 +; GFX9-O0-NEXT: s_mov_b32 s39, s36 ; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35 ; GFX9-O0-NEXT: s_mov_b32 s35, s5 ; GFX9-O0-NEXT: s_mov_b32 s44, s35 -; GFX9-O0-NEXT: s_mov_b32 s36, s34 -; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39 -; GFX9-O0-NEXT: s_mov_b32 s37, s44 -; GFX9-O0-NEXT: s_mov_b32 s38, s43 -; GFX9-O0-NEXT: s_mov_b32 s39, s42 +; GFX9-O0-NEXT: s_mov_b32 s40, s34 +; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41_sgpr42_sgpr43 +; GFX9-O0-NEXT: s_mov_b32 s41, s44 +; GFX9-O0-NEXT: s_mov_b32 s42, s39 +; GFX9-O0-NEXT: s_mov_b32 s43, s38 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_writelane_b32 v0, s40, 0 -; GFX9-O0-NEXT: v_writelane_b32 v0, s41, 1 +; GFX9-O0-NEXT: v_writelane_b32 v0, s36, 0 +; GFX9-O0-NEXT: v_writelane_b32 v0, s37, 1 ; GFX9-O0-NEXT: v_writelane_b32 v0, s34, 2 ; GFX9-O0-NEXT: v_writelane_b32 v0, s35, 3 -; GFX9-O0-NEXT: s_mov_b32 s34, 0 -; GFX9-O0-NEXT: s_nop 2 -; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], s34 +; GFX9-O0-NEXT: s_mov_b32 s36, 0 +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], off, s[40:43], s36 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 +; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s34 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s36 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s34 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s36 ; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O0-NEXT: v_add_u32_e64 v1, v1, v2 -; GFX9-O0-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[36:37], v3, s34 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s34 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[34:35], v3, s36 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s36 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[34:35], exec -; GFX9-O0-NEXT: v_writelane_b32 v0, s34, 4 -; GFX9-O0-NEXT: v_writelane_b32 v0, s35, 5 +; GFX9-O0-NEXT: s_mov_b64 s[36:37], exec +; GFX9-O0-NEXT: v_writelane_b32 v0, s36, 4 +; GFX9-O0-NEXT: v_writelane_b32 v0, s37, 5 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47] -; GFX9-O0-NEXT: s_and_b64 s[34:35], s[34:35], s[36:37] -; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: s_cbranch_execz .LBB1_2 -; GFX9-O0-NEXT: ; %bb.1: ; %if +; GFX9-O0-NEXT: s_and_b64 s[36:37], s[34:35], -1 +; GFX9-O0-NEXT: s_cmov_b64 exec, s[34:35] +; GFX9-O0-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX9-O0-NEXT: s_branch .LBB1_2 +; GFX9-O0-NEXT: .LBB1_1: ; %if +; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s34, v0, 4 +; GFX9-O0-NEXT: v_readlane_b32 s35, v0, 5 ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-O0-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O0-NEXT: v_add_u32_e64 v1, v2, v1 -; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-O0-NEXT: .LBB1_2: ; %merge ; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s36, v0, 4 -; GFX9-O0-NEXT: v_readlane_b32 s37, v0, 5 -; GFX9-O0-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX9-O0-NEXT: v_readlane_b32 s38, v0, 0 ; GFX9-O0-NEXT: v_readlane_b32 s39, v0, 1 ; GFX9-O0-NEXT: v_readlane_b32 s34, v0, 2 @@ -267,23 +272,25 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) ; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; GFX9-O3-NEXT: s_mov_b64 s[34:35], exec ; GFX9-O3-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: v_mov_b32_e32 v2, v3 ; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-O3-NEXT: s_and_saveexec_b64 s[34:35], vcc -; GFX9-O3-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-O3-NEXT: s_and_b64 s[36:37], vcc, -1 +; GFX9-O3-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-O3-NEXT: s_cmov_b64 exec, vcc +; GFX9-O3-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX9-O3-NEXT: ; %bb.1: ; %if ; GFX9-O3-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 @@ -297,9 +304,9 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-O3-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-O3-NEXT: .LBB1_2: ; %merge ; GFX9-O3-NEXT: s_or_b64 exec, exec, s[34:35] -; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX9-O3-NEXT: .LBB1_2: ; %merge +; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX9-O3-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-O3-NEXT: v_and_b32_e32 v0, 2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll index def51f2b16d3e..524870bbafd8e 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -146,64 +146,69 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 2 ; GFX9-O0-NEXT: v_writelane_b32 v0, s0, 3 ; GFX9-O0-NEXT: v_writelane_b32 v0, s1, 4 -; GFX9-O0-NEXT: s_mov_b32 s0, 0 -; GFX9-O0-NEXT: s_nop 2 -; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], s0 +; GFX9-O0-NEXT: s_mov_b32 s2, 0 +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], s2 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill -; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3 +; GFX9-O0-NEXT: ; implicit-def: $sgpr0_sgpr1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O0-NEXT: v_add_u32_e64 v1, v1, v2 -; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v3, s0 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], v3, s2 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[0:1], exec -; GFX9-O0-NEXT: v_writelane_b32 v0, s0, 5 -; GFX9-O0-NEXT: v_writelane_b32 v0, s1, 6 +; GFX9-O0-NEXT: s_mov_b64 s[2:3], exec +; GFX9-O0-NEXT: v_writelane_b32 v0, s2, 5 +; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 6 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[12:13] -; GFX9-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-O0-NEXT: s_cbranch_execz .LBB1_2 -; GFX9-O0-NEXT: ; %bb.1: ; %if +; GFX9-O0-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX9-O0-NEXT: s_cmov_b64 exec, s[0:1] +; GFX9-O0-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX9-O0-NEXT: s_branch .LBB1_2 +; GFX9-O0-NEXT: .LBB1_1: ; %if +; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[12:13] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s0, v0, 5 +; GFX9-O0-NEXT: v_readlane_b32 s1, v0, 6 ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:16 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-O0-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O0-NEXT: v_add_u32_e64 v1, v2, v1 -; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-O0-NEXT: .LBB1_2: ; %merge ; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[12:13] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v0, 5 -; GFX9-O0-NEXT: v_readlane_b32 s5, v0, 6 -; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: v_readlane_b32 s2, v0, 1 ; GFX9-O0-NEXT: v_readlane_b32 s3, v0, 2 ; GFX9-O0-NEXT: v_readlane_b32 s0, v0, 3 @@ -233,23 +238,25 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O3-LABEL: cfg: ; GFX9-O3: ; %bb.0: ; %entry ; GFX9-O3-NEXT: buffer_load_dwordx2 v[3:4], off, s[0:3], 0 +; GFX9-O3-NEXT: s_mov_b64 s[4:5], exec ; GFX9-O3-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O3-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: v_mov_b32_e32 v2, v3 ; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-O3-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-O3-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-O3-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-O3-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX9-O3-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-O3-NEXT: s_cmov_b64 exec, vcc +; GFX9-O3-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX9-O3-NEXT: ; %bb.1: ; %if ; GFX9-O3-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 @@ -263,9 +270,9 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-O3-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O3-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-O3-NEXT: .LBB1_2: ; %merge ; GFX9-O3-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX9-O3-NEXT: .LBB1_2: ; %merge +; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX9-O3-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-O3-NEXT: v_and_b32_e32 v0, 2, v0 @@ -1016,64 +1023,69 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 2 ; GFX9-O0-NEXT: v_writelane_b32 v0, s0, 3 ; GFX9-O0-NEXT: v_writelane_b32 v0, s1, 4 -; GFX9-O0-NEXT: s_mov_b32 s0, 0 -; GFX9-O0-NEXT: s_nop 2 -; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], s0 +; GFX9-O0-NEXT: s_mov_b32 s2, 0 +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], s2 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill -; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3 +; GFX9-O0-NEXT: ; implicit-def: $sgpr0_sgpr1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O0-NEXT: v_add_u32_e64 v1, v1, v2 -; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v3, s0 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[0:1], v3, s2 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[0:1], exec -; GFX9-O0-NEXT: v_writelane_b32 v0, s0, 5 -; GFX9-O0-NEXT: v_writelane_b32 v0, s1, 6 +; GFX9-O0-NEXT: s_mov_b64 s[2:3], exec +; GFX9-O0-NEXT: v_writelane_b32 v0, s2, 5 +; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 6 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[12:13] -; GFX9-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-O0-NEXT: s_cbranch_execz .LBB8_2 -; GFX9-O0-NEXT: ; %bb.1: ; %if +; GFX9-O0-NEXT: s_and_b64 s[2:3], s[0:1], -1 +; GFX9-O0-NEXT: s_cmov_b64 exec, s[0:1] +; GFX9-O0-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX9-O0-NEXT: s_branch .LBB8_2 +; GFX9-O0-NEXT: .LBB8_1: ; %if +; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[12:13] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s0, v0, 5 +; GFX9-O0-NEXT: v_readlane_b32 s1, v0, 6 ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:16 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-O0-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O0-NEXT: v_add_u32_e64 v1, v2, v1 -; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-O0-NEXT: .LBB8_2: ; %merge ; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[12:13] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s4, v0, 5 -; GFX9-O0-NEXT: v_readlane_b32 s5, v0, 6 -; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: v_readlane_b32 s2, v0, 1 ; GFX9-O0-NEXT: v_readlane_b32 s3, v0, 2 ; GFX9-O0-NEXT: v_readlane_b32 s0, v0, 3 @@ -1103,23 +1115,25 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O3-LABEL: strict_wwm_cfg: ; GFX9-O3: ; %bb.0: ; %entry ; GFX9-O3-NEXT: buffer_load_dwordx2 v[3:4], off, s[0:3], 0 +; GFX9-O3-NEXT: s_mov_b64 s[4:5], exec ; GFX9-O3-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O3-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: v_mov_b32_e32 v2, v3 ; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-O3-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-O3-NEXT: s_cbranch_execz .LBB8_2 +; GFX9-O3-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-O3-NEXT: s_and_b64 s[6:7], vcc, -1 +; GFX9-O3-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-O3-NEXT: s_cmov_b64 exec, vcc +; GFX9-O3-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX9-O3-NEXT: ; %bb.1: ; %if ; GFX9-O3-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 @@ -1133,9 +1147,9 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-O3-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O3-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-O3-NEXT: .LBB8_2: ; %merge ; GFX9-O3-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX9-O3-NEXT: .LBB8_2: ; %merge +; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX9-O3-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-O3-NEXT: v_and_b32_e32 v0, 2, v0 diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.generated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.generated.expected index d1500e002d7e9..d514cc27206bc 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.generated.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.generated.expected @@ -70,7 +70,7 @@ attributes #0 = { noredzone nounwind ssp uwtable "frame-pointer"="all" } ; CHECK-NEXT: .cfi_startproc ; CHECK-NEXT: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s8, s33 +; CHECK-NEXT: s_mov_b32 s10, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_addk_i32 s32, 0x600 ; CHECK-NEXT: v_mov_b32_e32 v4, 0 @@ -78,31 +78,36 @@ attributes #0 = { noredzone nounwind ssp uwtable "frame-pointer"="all" } ; CHECK-NEXT: v_mov_b32_e32 v1, 2 ; CHECK-NEXT: v_mov_b32_e32 v2, 3 ; CHECK-NEXT: v_mov_b32_e32 v3, 4 +; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:12 +; CHECK-NEXT: s_and_b64 s[6:7], s[4:5], exec +; CHECK-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; CHECK-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:16 -; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[6:7] -; CHECK-NEXT: s_cbranch_execz .LBB0_2 +; CHECK-NEXT: s_cmov_b64 exec, s[6:7] +; CHECK-NEXT: s_cbranch_scc0 .LBB0_2 ; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:12 ; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:16 +; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: .LBB0_2: ; %Flow -; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; CHECK-NEXT: s_cbranch_execz .LBB0_4 +; CHECK-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; CHECK-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; CHECK-NEXT: s_cmov_b64 exec, s[4:5] +; CHECK-NEXT: s_cbranch_scc0 .LBB0_4 ; CHECK-NEXT: ; %bb.3: ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:12 +; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] ; CHECK-NEXT: .LBB0_4: -; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_addk_i32 s32, 0xfa00 -; CHECK-NEXT: s_mov_b32 s33, s8 +; CHECK-NEXT: s_mov_b32 s33, s10 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.nogenerated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.nogenerated.expected index deadc4adb02c5..ecfb1f138153a 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.nogenerated.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.nogenerated.expected @@ -11,7 +11,7 @@ define dso_local i32 @check_boundaries() #0 { ; CHECK-NEXT: .cfi_startproc ; CHECK-NEXT: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s8, s33 +; CHECK-NEXT: s_mov_b32 s10, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_addk_i32 s32, 0x600 ; CHECK-NEXT: v_mov_b32_e32 v4, 0 @@ -19,31 +19,36 @@ define dso_local i32 @check_boundaries() #0 { ; CHECK-NEXT: v_mov_b32_e32 v1, 2 ; CHECK-NEXT: v_mov_b32_e32 v2, 3 ; CHECK-NEXT: v_mov_b32_e32 v3, 4 +; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:12 +; CHECK-NEXT: s_and_b64 s[6:7], s[4:5], exec +; CHECK-NEXT: s_xor_b64 s[4:5], s[6:7], exec +; CHECK-NEXT: s_and_b64 s[8:9], s[6:7], -1 ; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:16 -; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[6:7] -; CHECK-NEXT: s_cbranch_execz .LBB0_2 +; CHECK-NEXT: s_cmov_b64 exec, s[6:7] +; CHECK-NEXT: s_cbranch_scc0 .LBB0_2 ; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:12 ; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:16 +; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: .LBB0_2: ; %Flow -; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; CHECK-NEXT: s_cbranch_execz .LBB0_4 +; CHECK-NEXT: s_xor_b64 s[6:7], s[4:5], exec +; CHECK-NEXT: s_and_b64 s[8:9], s[4:5], -1 +; CHECK-NEXT: s_cmov_b64 exec, s[4:5] +; CHECK-NEXT: s_cbranch_scc0 .LBB0_4 ; CHECK-NEXT: ; %bb.3: ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:12 +; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] ; CHECK-NEXT: .LBB0_4: -; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_addk_i32 s32, 0xfa00 -; CHECK-NEXT: s_mov_b32 s33, s8 +; CHECK-NEXT: s_mov_b32 s33, s10 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %1 = alloca i32, align 4, addrspace(5)