diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index e889c5e75d190..0bf370bb745c4 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -7333,44 +7333,12 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI, return CreatedBB; } - // Legalize REG_SEQUENCE and PHI - // The register class of the operands much be the same type as the register + // Legalize PHI + // The register class of the operands must be the same type as the register // class of the output. if (MI.getOpcode() == AMDGPU::PHI) { - const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr; - for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) { - if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual()) - continue; - const TargetRegisterClass *OpRC = - MRI.getRegClass(MI.getOperand(i).getReg()); - if (RI.hasVectorRegisters(OpRC)) { - VRC = OpRC; - } else { - SRC = OpRC; - } - } - - // If any of the operands are VGPR registers, then they all most be - // otherwise we will create illegal VGPR->SGPR copies when legalizing - // them. - if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) { - if (!VRC) { - assert(SRC); - if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) { - VRC = &AMDGPU::VReg_1RegClass; - } else - VRC = RI.isAGPRClass(getOpRegClass(MI, 0)) - ? RI.getEquivalentAGPRClass(SRC) - : RI.getEquivalentVGPRClass(SRC); - } else { - VRC = RI.isAGPRClass(getOpRegClass(MI, 0)) - ? RI.getEquivalentAGPRClass(VRC) - : RI.getEquivalentVGPRClass(VRC); - } - RC = VRC; - } else { - RC = SRC; - } + const TargetRegisterClass *VRC = getOpRegClass(MI, 0); + assert(!RI.isSGPRClass(VRC)); // Update all the operands so they have the same type. for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { @@ -7384,7 +7352,7 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI, // Avoid creating no-op copies with the same src and dst reg class. These // confuse some of the machine passes. - legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc()); + legalizeGenericOperand(*InsertBB, Insert, VRC, Op, MRI, MI.getDebugLoc()); } } diff --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll index b8962fa29e8f1..b5f952b0bb00d 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll @@ -926,12 +926,12 @@ define void @flat_atomic_xchg_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xchg_i64_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -939,23 +939,23 @@ define void @flat_atomic_xchg_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: s_cbranch_execz .LBB14_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc +; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: .LBB14_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB14_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc -; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB14_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(2) @@ -1016,12 +1016,12 @@ define void @flat_atomic_xchg_i64_ret_av_v(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xchg_i64_ret_av_v: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -1029,23 +1029,23 @@ define void @flat_atomic_xchg_i64_ret_av_v(ptr %ptr) #0 { ; GFX90A-NEXT: s_cbranch_execz .LBB15_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc +; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: .LBB15_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB15_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc -; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB15_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(2) @@ -1294,12 +1294,12 @@ define void @flat_atomic_xchg_i64_ret_v_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xchg_i64_ret_v_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -1307,23 +1307,23 @@ define void @flat_atomic_xchg_i64_ret_v_av(ptr %ptr) #0 { ; GFX90A-NEXT: s_cbranch_execz .LBB18_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc +; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: .LBB18_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB18_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc -; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB18_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(2) @@ -6406,35 +6406,35 @@ define void @flat_atomic_add_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_add_i64_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB90_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[4:5] glc +; GFX90A-NEXT: flat_atomic_add_x2 v[0:1], v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: .LBB90_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB90_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc -; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v0, v4 -; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v1, v5, vcc -; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB90_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6591,35 +6591,35 @@ define void @flat_atomic_sub_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_sub_i64_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB92_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[4:5] glc +; GFX90A-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: .LBB92_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB92_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc -; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v0, v4 -; GFX90A-NEXT: v_subb_co_u32_e32 v4, vcc, v1, v5, vcc -; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2 +; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB92_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -8881,28 +8881,28 @@ define void @flat_atomic_usub_sat_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_usub_sat_i64_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[6:7] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB114_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: .LBB114_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v6 -; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v7, vcc +; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4 +; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc ; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] @@ -8911,20 +8911,20 @@ define void @flat_atomic_usub_sat_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: s_cbranch_execnz .LBB114_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB114_4: ; %Flow3 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB114_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v4, vcc +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc ; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v0, v6 +; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v0, v4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_subb_co_u32_e32 v4, vcc, v1, v7, vcc +; GFX90A-NEXT: v_subb_co_u32_e32 v4, vcc, v1, v5, vcc ; GFX90A-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc ; GFX90A-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc ; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen @@ -9027,28 +9027,29 @@ define void @flat_atomic_fadd_f32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 -; GFX90A-NEXT: ; implicit-def: $vgpr3 +; GFX90A-NEXT: ; implicit-def: $agpr0 ; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execz .LBB115_3 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global -; GFX90A-NEXT: global_atomic_add_f32 v3, v[0:1], v2, off glc -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc ; GFX90A-NEXT: ; implicit-def: $vgpr2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: .LBB115_3: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] ; GFX90A-NEXT: s_cbranch_execz .LBB115_5 ; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_f32_e32 v1, v3, v2 -; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX90A-NEXT: v_add_f32_e32 v2, v1, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1 +; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; GFX90A-NEXT: .LBB115_5: ; %Flow1 ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v3 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2 ; GFX90A-NEXT: .LBB115_6: ; %Flow2 @@ -9065,6 +9066,7 @@ define void @flat_atomic_fadd_f32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_fadd_f32_ret_a_a: @@ -9827,31 +9829,33 @@ define void @flat_atomic_fadd_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execz .LBB127_3 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global -; GFX90A-NEXT: global_atomic_add_f64 v[4:5], v[0:1], v[2:3], off glc -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: .LBB127_3: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] ; GFX90A-NEXT: s_cbranch_execz .LBB127_5 ; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_f64 v[0:1], v[4:5], v[2:3] -; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB127_5: ; %Flow1 ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v5 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: .LBB127_6: ; %Flow2 @@ -9869,6 +9873,7 @@ define void @flat_atomic_fadd_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_fadd_f64_ret_a_a: @@ -9890,30 +9895,32 @@ define void @flat_atomic_fadd_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s3, v1 -; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX950-NEXT: s_cbranch_execz .LBB127_3 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.global -; GFX950-NEXT: global_atomic_add_f64 v[4:5], v[0:1], v[2:3], off sc0 -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 ; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: .LBB127_3: ; %Flow ; GFX950-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] ; GFX950-NEXT: s_cbranch_execz .LBB127_5 ; GFX950-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[4:5], v6, off +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_add_f64 v[0:1], v[4:5], v[2:3] -; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB127_5: ; %Flow1 ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v4 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v5 ; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-NEXT: .LBB127_6: ; %Flow2 @@ -9932,6 +9939,7 @@ define void @flat_atomic_fadd_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 %data = call double asm "; def $0", "=a"() @@ -10399,31 +10407,31 @@ define void @flat_atomic_fmax_f64_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fmax_f64_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB132_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[2:3], v[4:5] glc +; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: .LBB132_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB132_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] @@ -10587,31 +10595,31 @@ define void @flat_atomic_fmin_f64_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fmin_f64_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB134_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3], v[4:5] glc +; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: .LBB134_2: ; %Flow ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB134_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] @@ -14430,30 +14438,30 @@ define void @flat_atomic_xchg_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_cbranch_vccz .LBB194_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_swap_x2 v[2:3], v[2:3], v[0:1] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_cbranch_execz .LBB194_3 ; GFX90A-NEXT: s_branch .LBB194_4 ; GFX90A-NEXT: .LBB194_2: -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: .LBB194_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB194_4: ; %atomicrmw.end ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ; use v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -14467,27 +14475,27 @@ define void @flat_atomic_xchg_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ; def v[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_cbranch_vccz .LBB194_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] sc0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-NEXT: flat_atomic_swap_x2 v[2:3], v[2:3], v[0:1] sc0 ; GFX950-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-NEXT: s_cbranch_execz .LBB194_3 ; GFX950-NEXT: s_branch .LBB194_4 ; GFX950-NEXT: .LBB194_2: -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-NEXT: .LBB194_3: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 -; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 ; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-NEXT: .LBB194_4: ; %atomicrmw.end ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ; use v[2:3] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 @@ -14604,32 +14612,32 @@ define void @flat_atomic_add_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_cbranch_vccz .LBB196_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_add_x2 v[2:3], v[2:3], v[0:1] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_cbranch_execz .LBB196_3 ; GFX90A-NEXT: s_branch .LBB196_4 ; GFX90A-NEXT: .LBB196_2: -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: .LBB196_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2 -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc -; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB196_4: ; %atomicrmw.end ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ; use v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -14643,28 +14651,28 @@ define void @flat_atomic_add_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ; def v[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_cbranch_vccz .LBB196_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] sc0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-NEXT: flat_atomic_add_x2 v[2:3], v[2:3], v[0:1] sc0 ; GFX950-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-NEXT: s_cbranch_execz .LBB196_3 ; GFX950-NEXT: s_branch .LBB196_4 ; GFX950-NEXT: .LBB196_2: -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-NEXT: .LBB196_3: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 -; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3] -; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-NEXT: .LBB196_4: ; %atomicrmw.end ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ; use v[2:3] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 @@ -14783,32 +14791,32 @@ define void @flat_atomic_sub_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_cbranch_vccz .LBB198_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_sub_x2 v[2:3], v[2:3], v[0:1] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_cbranch_execz .LBB198_3 ; GFX90A-NEXT: s_branch .LBB198_4 ; GFX90A-NEXT: .LBB198_2: -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: .LBB198_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2 -; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc -; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v0 +; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB198_4: ; %atomicrmw.end ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ; use v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -14963,32 +14971,32 @@ define void @flat_atomic_and_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_cbranch_vccz .LBB200_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_and_x2 v[2:3], v[2:3], v[0:1] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_cbranch_execz .LBB200_3 ; GFX90A-NEXT: s_branch .LBB200_4 ; GFX90A-NEXT: .LBB200_2: -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: .LBB200_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_and_b32_e32 v3, v1, v3 -; GFX90A-NEXT: v_and_b32_e32 v2, v0, v2 -; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_and_b32_e32 v1, v3, v1 +; GFX90A-NEXT: v_and_b32_e32 v0, v2, v0 +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB200_4: ; %atomicrmw.end ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ; use v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -15392,32 +15400,32 @@ define void @flat_atomic_or_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_cbranch_vccz .LBB204_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_or_x2 v[2:3], v[2:3], v[0:1] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_cbranch_execz .LBB204_3 ; GFX90A-NEXT: s_branch .LBB204_4 ; GFX90A-NEXT: .LBB204_2: -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: .LBB204_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX90A-NEXT: v_or_b32_e32 v2, v0, v2 -; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX90A-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB204_4: ; %atomicrmw.end ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ; use v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -15571,32 +15579,32 @@ define void @flat_atomic_xor_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_cbranch_vccz .LBB206_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_xor_x2 v[2:3], v[2:3], v[0:1] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_cbranch_execz .LBB206_3 ; GFX90A-NEXT: s_branch .LBB206_4 ; GFX90A-NEXT: .LBB206_2: -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: .LBB206_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_xor_b32_e32 v3, v1, v3 -; GFX90A-NEXT: v_xor_b32_e32 v2, v0, v2 -; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v1 +; GFX90A-NEXT: v_xor_b32_e32 v0, v2, v0 +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB206_4: ; %atomicrmw.end ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ; use v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -15753,33 +15761,33 @@ define void @flat_atomic_max_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_cbranch_vccz .LBB208_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_smax_x2 v[2:3], v[2:3], v[0:1] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_cbranch_execz .LBB208_3 ; GFX90A-NEXT: s_branch .LBB208_4 ; GFX90A-NEXT: .LBB208_2: -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: .LBB208_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc -; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB208_4: ; %atomicrmw.end ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ; use v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -15938,33 +15946,33 @@ define void @flat_atomic_min_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_cbranch_vccz .LBB210_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_smin_x2 v[2:3], v[2:3], v[0:1] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_cbranch_execz .LBB210_3 ; GFX90A-NEXT: s_branch .LBB210_4 ; GFX90A-NEXT: .LBB210_2: -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: .LBB210_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc -; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_le_i64_e32 vcc, v[2:3], v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB210_4: ; %atomicrmw.end ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ; use v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -16123,33 +16131,33 @@ define void @flat_atomic_umax_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_cbranch_vccz .LBB212_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_umax_x2 v[2:3], v[2:3], v[0:1] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_cbranch_execz .LBB212_3 ; GFX90A-NEXT: s_branch .LBB212_4 ; GFX90A-NEXT: .LBB212_2: -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: .LBB212_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc -; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB212_4: ; %atomicrmw.end ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ; use v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -16308,33 +16316,33 @@ define void @flat_atomic_umin_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_cbranch_vccz .LBB214_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_umin_x2 v[2:3], v[2:3], v[0:1] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_cbranch_execz .LBB214_3 ; GFX90A-NEXT: s_branch .LBB214_4 ; GFX90A-NEXT: .LBB214_2: -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: .LBB214_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc -; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_le_u64_e32 vcc, v[2:3], v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB214_4: ; %atomicrmw.end ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ; use v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -16689,37 +16697,37 @@ define void @flat_atomic_udec_wrap_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_cbranch_vccz .LBB218_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[0:1] glc ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_cbranch_execz .LBB218_3 ; GFX90A-NEXT: s_branch .LBB218_4 ; GFX90A-NEXT: .LBB218_2: -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: .LBB218_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, -1, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v1, vcc -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GFX90A-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, -1, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v3, vcc +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX90A-NEXT: v_cmp_gt_u64_e64 s[4:5], v[2:3], v[0:1] ; GFX90A-NEXT: s_or_b64 vcc, vcc, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc -; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB218_4: ; %atomicrmw.end ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ; use v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -17310,51 +17318,49 @@ define void @flat_atomic_fadd_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: s_cbranch_vccz .LBB223_4 +; GFX90A-NEXT: s_cbranch_vccz .LBB223_3 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_vccz .LBB223_7 +; GFX90A-NEXT: s_cbranch_vccz .LBB223_4 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX90A-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NEXT: global_atomic_add_f32 v1, v1, v0, s[4:5] glc -; GFX90A-NEXT: s_cbranch_execz .LBB223_8 -; GFX90A-NEXT: ; %bb.3: ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1 ; GFX90A-NEXT: s_cbranch_execz .LBB223_5 ; GFX90A-NEXT: s_branch .LBB223_6 +; GFX90A-NEXT: .LBB223_3: +; GFX90A-NEXT: ; implicit-def: $agpr0 +; GFX90A-NEXT: s_branch .LBB223_7 ; GFX90A-NEXT: .LBB223_4: ; GFX90A-NEXT: ; implicit-def: $agpr0 -; GFX90A-NEXT: .LBB223_5: ; %atomicrmw.shared +; GFX90A-NEXT: .LBB223_5: ; %atomicrmw.private +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s6, s4, -1 +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_f32_e32 v3, v2, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GFX90A-NEXT: .LBB223_6: ; %Flow1 +; GFX90A-NEXT: s_cbranch_execnz .LBB223_8 +; GFX90A-NEXT: .LBB223_7: ; %atomicrmw.shared ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NEXT: ds_add_rtn_f32 v0, v1, v0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: .LBB223_6: ; %atomicrmw.end +; GFX90A-NEXT: .LBB223_8: ; %atomicrmw.end ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] -; GFX90A-NEXT: .LBB223_7: -; GFX90A-NEXT: ; implicit-def: $vgpr1 -; GFX90A-NEXT: .LBB223_8: ; %atomicrmw.private -; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX90A-NEXT: s_cselect_b32 s6, s4, -1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_f32_e32 v3, v2, v0 -; GFX90A-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: s_cbranch_execz .LBB223_5 -; GFX90A-NEXT: s_branch .LBB223_6 ; ; GFX950-LABEL: flat_atomic_fadd_f32_saddr_ret_a_a: ; GFX950: ; %bb.0: @@ -18162,13 +18168,16 @@ define void @flat_atomic_fadd_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[4:5] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_cbranch_execz .LBB235_5 ; GFX90A-NEXT: s_branch .LBB235_6 ; GFX90A-NEXT: .LBB235_3: ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: s_branch .LBB235_7 ; GFX90A-NEXT: .LBB235_4: -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX90A-NEXT: .LBB235_5: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s6, s4, -1 @@ -18176,13 +18185,12 @@ define void @flat_atomic_fadd_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_add_f64 v[4:5], v[2:3], v[0:1] +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB235_6: ; %Flow1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_cbranch_execnz .LBB235_8 ; GFX90A-NEXT: .LBB235_7: ; %atomicrmw.shared ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -18196,6 +18204,7 @@ define void @flat_atomic_fadd_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_fadd_f64_saddr_ret_a_a: @@ -18222,24 +18231,26 @@ define void @flat_atomic_fadd_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX950-NEXT: v_mov_b32_e32 v2, 0 ; GFX950-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[0:1] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_cbranch_execz .LBB235_5 ; GFX950-NEXT: s_branch .LBB235_6 ; GFX950-NEXT: .LBB235_3: ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_branch .LBB235_7 ; GFX950-NEXT: .LBB235_4: -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: .LBB235_5: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s2, s0, -1 ; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s2 ; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_add_f64 v[4:5], v[2:3], v[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: scratch_store_dwordx2 off, v[4:5], s2 ; GFX950-NEXT: .LBB235_6: ; %Flow1 -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_cbranch_execnz .LBB235_8 ; GFX950-NEXT: .LBB235_7: ; %atomicrmw.shared ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 @@ -18253,6 +18264,7 @@ define void @flat_atomic_fadd_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 %data = call double asm "; def $0", "=a"() diff --git a/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll b/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll index 4bc6220b4d9a0..42f76c4a10d2a 100644 --- a/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll @@ -48,17 +48,16 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: .LBB0_1: ; %Flow9 ; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: s_andn2_b64 vcc, exec, s[24:25] -; CHECK-NEXT: v_mov_b64_e32 v[30:31], v[24:25] -; CHECK-NEXT: s_cbranch_vccz .LBB0_18 +; CHECK-NEXT: s_cbranch_vccz .LBB0_17 ; CHECK-NEXT: .LBB0_2: ; %._crit_edge1942.i.i.i3548 ; CHECK-NEXT: ; =>This Loop Header: Depth=1 -; CHECK-NEXT: ; Child Loop BB0_7 Depth 2 +; CHECK-NEXT: ; Child Loop BB0_6 Depth 2 ; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] -; CHECK-NEXT: s_cbranch_vccnz .LBB0_11 +; CHECK-NEXT: s_cbranch_vccnz .LBB0_9 ; CHECK-NEXT: ; %bb.3: ; %.preheader1868.i.i.i3244 ; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: s_mov_b64 vcc, s[4:5] -; CHECK-NEXT: s_cbranch_vccz .LBB0_12 +; CHECK-NEXT: s_cbranch_vccz .LBB0_10 ; CHECK-NEXT: ; %bb.4: ; %.preheader1855.i.i.i3329.preheader ; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: v_mov_b64_e32 v[24:25], s[14:15] @@ -86,54 +85,49 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: v_fmac_f64_e32 v[26:27], 0, v[28:29] ; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[18:19] ; CHECK-NEXT: v_fmac_f64_e32 v[28:29], 0, v[26:27] -; CHECK-NEXT: s_branch .LBB0_7 -; CHECK-NEXT: .LBB0_5: ; in Loop: Header=BB0_7 Depth=2 -; CHECK-NEXT: s_mov_b64 s[24:25], -1 -; CHECK-NEXT: ; implicit-def: $agpr0_agpr1 -; CHECK-NEXT: s_mov_b64 s[8:9], -1 -; CHECK-NEXT: .LBB0_6: ; %Flow -; CHECK-NEXT: ; in Loop: Header=BB0_7 Depth=2 +; CHECK-NEXT: s_branch .LBB0_6 +; CHECK-NEXT: .LBB0_5: ; %Flow +; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2 ; CHECK-NEXT: s_and_b64 vcc, exec, s[8:9] -; CHECK-NEXT: s_cbranch_vccnz .LBB0_13 -; CHECK-NEXT: .LBB0_7: ; %.preheader1855.i.i.i3329 +; CHECK-NEXT: s_cbranch_vccnz .LBB0_11 +; CHECK-NEXT: .LBB0_6: ; %.preheader1855.i.i.i3329 ; CHECK-NEXT: ; Parent Loop BB0_2 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 ; CHECK-NEXT: v_accvgpr_read_b32 v27, a1 ; CHECK-NEXT: v_accvgpr_read_b32 v26, a0 +; CHECK-NEXT: s_mov_b64 s[24:25], -1 +; CHECK-NEXT: s_mov_b64 s[8:9], -1 ; CHECK-NEXT: s_mov_b64 vcc, s[2:3] +; CHECK-NEXT: ; implicit-def: $agpr0_agpr1 ; CHECK-NEXT: s_cbranch_vccz .LBB0_5 -; CHECK-NEXT: ; %bb.8: ; %.lr.ph2070.i.i.i3291 -; CHECK-NEXT: ; in Loop: Header=BB0_7 Depth=2 +; CHECK-NEXT: ; %bb.7: ; %.lr.ph2070.i.i.i3291 +; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2 +; CHECK-NEXT: v_accvgpr_write_b32 a0, v30 +; CHECK-NEXT: v_accvgpr_write_b32 a1, v31 +; CHECK-NEXT: s_mov_b64 s[8:9], s[18:19] ; CHECK-NEXT: s_mov_b64 vcc, s[6:7] -; CHECK-NEXT: s_cbranch_vccz .LBB0_10 -; CHECK-NEXT: ; %bb.9: ; %.preheader1856.preheader.i.i.i3325 -; CHECK-NEXT: ; in Loop: Header=BB0_7 Depth=2 +; CHECK-NEXT: s_cbranch_vccz .LBB0_5 +; CHECK-NEXT: ; %bb.8: ; %.preheader1856.preheader.i.i.i3325 +; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2 ; CHECK-NEXT: v_accvgpr_write_b32 a0, v28 ; CHECK-NEXT: s_mov_b64 s[24:25], 0 ; CHECK-NEXT: v_accvgpr_write_b32 a1, v29 ; CHECK-NEXT: s_mov_b64 s[8:9], 0 -; CHECK-NEXT: s_branch .LBB0_6 -; CHECK-NEXT: .LBB0_10: ; in Loop: Header=BB0_7 Depth=2 -; CHECK-NEXT: v_accvgpr_write_b32 a0, v30 -; CHECK-NEXT: s_mov_b64 s[24:25], -1 -; CHECK-NEXT: v_accvgpr_write_b32 a1, v31 -; CHECK-NEXT: s_mov_b64 s[8:9], s[18:19] -; CHECK-NEXT: s_branch .LBB0_6 -; CHECK-NEXT: .LBB0_11: ; in Loop: Header=BB0_2 Depth=1 -; CHECK-NEXT: v_mov_b64_e32 v[24:25], s[10:11] +; CHECK-NEXT: s_branch .LBB0_5 +; CHECK-NEXT: .LBB0_9: ; in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: s_mov_b64 s[22:23], 0 +; CHECK-NEXT: v_mov_b64_e32 v[30:31], s[10:11] ; CHECK-NEXT: s_mov_b64 s[8:9], s[20:21] -; CHECK-NEXT: s_branch .LBB0_16 -; CHECK-NEXT: .LBB0_12: ; in Loop: Header=BB0_2 Depth=1 +; CHECK-NEXT: s_branch .LBB0_15 +; CHECK-NEXT: .LBB0_10: ; in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: s_mov_b64 s[8:9], -1 ; CHECK-NEXT: v_mov_b64_e32 v[22:23], 0 -; CHECK-NEXT: v_mov_b64_e32 v[24:25], v[30:31] -; CHECK-NEXT: s_branch .LBB0_16 -; CHECK-NEXT: .LBB0_13: ; %loop.exit.guard +; CHECK-NEXT: s_branch .LBB0_15 +; CHECK-NEXT: .LBB0_11: ; %loop.exit.guard ; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: s_and_b64 vcc, exec, s[24:25] -; CHECK-NEXT: s_cbranch_vccz .LBB0_15 -; CHECK-NEXT: ; %bb.14: ; %._crit_edge2105.i.i.i2330.loopexit +; CHECK-NEXT: s_cbranch_vccz .LBB0_13 +; CHECK-NEXT: ; %bb.12: ; %._crit_edge2105.i.i.i2330.loopexit ; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: v_cmp_nlg_f64_e64 s[8:9], 0, v[26:27] ; CHECK-NEXT: v_cndmask_b32_e64 v23, v23, 0, s[16:17] @@ -145,21 +139,24 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: s_cselect_b32 s23, s23, 0 ; CHECK-NEXT: s_cselect_b32 s22, s22, 0 ; CHECK-NEXT: s_mov_b64 s[8:9], -1 -; CHECK-NEXT: s_branch .LBB0_16 -; CHECK-NEXT: .LBB0_15: ; in Loop: Header=BB0_2 Depth=1 +; CHECK-NEXT: s_branch .LBB0_14 +; CHECK-NEXT: .LBB0_13: ; in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: s_mov_b64 s[8:9], 0 ; CHECK-NEXT: v_mov_b64_e32 v[22:23], 0 -; CHECK-NEXT: .LBB0_16: ; %Flow6 +; CHECK-NEXT: .LBB0_14: ; %Flow6 +; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 +; CHECK-NEXT: v_mov_b64_e32 v[30:31], v[24:25] +; CHECK-NEXT: .LBB0_15: ; %Flow6 ; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: s_mov_b64 s[24:25], -1 ; CHECK-NEXT: s_and_b64 vcc, exec, s[8:9] ; CHECK-NEXT: s_cbranch_vccz .LBB0_1 -; CHECK-NEXT: ; %bb.17: ; %._crit_edge2105.i.i.i2330 +; CHECK-NEXT: ; %bb.16: ; %._crit_edge2105.i.i.i2330 ; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: s_mov_b64 s[24:25], 0 ; CHECK-NEXT: global_store_dwordx2 v20, v[20:21], s[12:13] ; CHECK-NEXT: s_branch .LBB0_1 -; CHECK-NEXT: .LBB0_18: ; %DummyReturnBlock +; CHECK-NEXT: .LBB0_17: ; %DummyReturnBlock ; CHECK-NEXT: s_endpgm entry: br label %._crit_edge1942.i.i.i3548 diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll index 9e7baa18bf5ba..eb8ea50646f78 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll @@ -467,6 +467,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr57, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec + ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF @@ -488,12 +489,12 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.40.Flow23: ; GFX90A-NEXT: successors: %bb.38(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr68_sgpr69, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr40_sgpr41, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_XOR_B64 $exec, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc @@ -508,7 +509,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.41.bb41: ; GFX90A-NEXT: successors: %bb.46(0x40000000), %bb.42(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr66_sgpr67, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr54_sgpr55 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr54_sgpr55, $sgpr66_sgpr67, $sgpr68_sgpr69 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr58 = V_ADD_CO_U32_e32 4096, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $vgpr1, dead renamable $sgpr18_sgpr19 = V_ADDC_U32_e64 0, $vgpr41, $vcc, 0, implicit $exec @@ -538,17 +539,17 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.42.Flow24: ; GFX90A-NEXT: successors: %bb.40(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr42_sgpr43, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc @@ -560,8 +561,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59, $sgpr48_sgpr49 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: S_BITCMP1_B32 renamable $sgpr20, 16, implicit-def $scc - ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_CSELECT_B64 -1, 0, implicit killed $scc - ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_XOR_B64 renamable $sgpr66_sgpr67, -1, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_CSELECT_B64 -1, 0, implicit killed $scc + ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_XOR_B64 renamable $sgpr64_sgpr65, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $vgpr62 = V_ADD_CO_U32_e32 6144, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $vgpr63, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr18_sgpr19, implicit-def dead $scc @@ -605,7 +606,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.46.bb48: ; GFX90A-NEXT: successors: %bb.43(0x40000000), %bb.47(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr66_sgpr67, $sgpr58_sgpr59, $sgpr64_sgpr65, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr60_sgpr61 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr66_sgpr67, $sgpr58_sgpr59, $sgpr68_sgpr69, $sgpr64_sgpr65, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr60_sgpr61 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr60 = V_ADD_CO_U32_e32 5120, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = COPY $vcc @@ -614,7 +615,6 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr0 = FLAT_LOAD_UBYTE killed renamable $vgpr0_vgpr1, 1024, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i51) ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = COPY renamable $sgpr36_sgpr37 - ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr70_sgpr71 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr61, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $sgpr18_sgpr19, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec @@ -646,7 +646,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr70_sgpr71, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc @@ -655,7 +655,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.48.bb63: ; GFX90A-NEXT: successors: %bb.50(0x40000000), %bb.49(0x40000000) - ; GFX90A-NEXT: liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr56_sgpr57:0x000000000000000F, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59, $sgpr48_sgpr49 + ; GFX90A-NEXT: liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59, $sgpr48_sgpr49 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0 ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.50, implicit $vcc @@ -669,7 +669,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.50.bb68: ; GFX90A-NEXT: successors: %bb.54(0x40000000), %bb.51(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57:0x000000000000000F, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = nuw nsw V_LSHLREV_B32_e32 3, $vgpr6, implicit $exec ; GFX90A-NEXT: renamable $vgpr1 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec @@ -698,7 +698,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.52.bb80: ; GFX90A-NEXT: successors: %bb.59(0x40000000), %bb.53(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57:0x000000000000000F, $sgpr62_sgpr63, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57:0x000000000000000F, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: dead renamable $sgpr17 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def $scc ; GFX90A-NEXT: renamable $vgpr10 = V_ADD_CO_U32_e32 4096, $vgpr0, implicit-def $vcc, implicit $exec @@ -711,7 +711,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr7 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF @@ -726,7 +726,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.54.bb73: ; GFX90A-NEXT: successors: %bb.52(0x40000000), %bb.55(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr60_sgpr61 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr60_sgpr61 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr3 = FLAT_LOAD_UBYTE renamable $vgpr0_vgpr1, 2048, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i76) ; GFX90A-NEXT: renamable $vgpr8 = V_ADD_CO_U32_e32 2048, $vgpr0, implicit-def $vcc, implicit $exec @@ -758,9 +758,9 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.56.bb90: ; GFX90A-NEXT: successors: %bb.60(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr56_sgpr57:0x000000000000000F, $sgpr62_sgpr63, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr56_sgpr57:0x000000000000000F, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr30 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr66_sgpr67, implicit $exec + ; GFX90A-NEXT: renamable $vgpr30 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr64_sgpr65, implicit $exec ; GFX90A-NEXT: renamable $vgpr3 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr7 = COPY renamable $sgpr21, implicit $exec ; GFX90A-NEXT: renamable $vgpr24_vgpr25 = DS_READ_B64_gfx9 killed renamable $vgpr7, 0, 0, implicit $exec :: (load (s64) from %ir.4, addrspace 3) @@ -772,7 +772,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr7 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr12_sgpr13, implicit $exec ; GFX90A-NEXT: renamable $vgpr14_vgpr15 = V_LSHRREV_B64_e64 1, $vgpr22_vgpr23, implicit $exec ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_OR_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_OR_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $vgpr16 = COPY renamable $vgpr22, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.60 ; GFX90A-NEXT: {{ $}} @@ -832,14 +832,14 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.59.bb85: ; GFX90A-NEXT: successors: %bb.56(0x40000000), %bb.60(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57:0x000000000000000F, $sgpr62_sgpr63, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57:0x000000000000000F, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr12 = V_OR_B32_e32 1, $vgpr10, implicit $exec ; GFX90A-NEXT: renamable $vgpr13 = COPY renamable $vgpr11, implicit $exec ; GFX90A-NEXT: renamable $vgpr3 = FLAT_LOAD_UBYTE renamable $vgpr12_vgpr13, 0, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i86) ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr3, implicit $exec - ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $vgpr7 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr22 = IMPLICIT_DEF @@ -854,20 +854,20 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.60.Flow31: ; GFX90A-NEXT: successors: %bb.61(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr62_sgpr63, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr54_sgpr55, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 0 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.61.Flow30: ; GFX90A-NEXT: successors: %bb.55(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr7, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr62_sgpr63, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_XOR_B64 $exec, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_OR_B64 killed renamable $sgpr52_sgpr53, killed renamable $sgpr56_sgpr57, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.55 ; GFX90A-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll index 9778c61c44e6e..a3c72ce6d4b37 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll @@ -6952,19 +6952,19 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX90A-LABEL: local_atomic_fadd_ret_v2bf16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v4, v0 +; GFX90A-NEXT: ds_read_b32 v3, v0 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; GFX90A-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v4 ; GFX90A-NEXT: v_bfe_u32 v6, v1, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v1 @@ -6976,11 +6976,11 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX90A-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX90A-NEXT: v_perm_b32 v1, v5, v1, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v4, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7313,19 +7313,19 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX90A-LABEL: local_atomic_fadd_ret_v2bf16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v4, v0 offset:65532 +; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; GFX90A-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v4 ; GFX90A-NEXT: v_bfe_u32 v6, v1, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v1 @@ -7337,11 +7337,11 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX90A-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX90A-NEXT: v_perm_b32 v1, v5, v1, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v4, v1 offset:65532 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB25_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll index 91add012bdcfa..f144d5636765c 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll @@ -6715,19 +6715,19 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX942-LABEL: local_atomic_fmax_ret_v2bf16: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v4, v0 +; GFX942-NEXT: ds_read_b32 v3, v0 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX942-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; GFX942-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX942-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX942-NEXT: v_max_f32_e32 v5, v5, v4 ; GFX942-NEXT: v_bfe_u32 v6, v1, 16, 1 ; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v1 @@ -6740,11 +6740,11 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX942-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[0:1] ; GFX942-NEXT: v_perm_b32 v1, v5, v1, s5 -; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v4, v1 +; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB24_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6887,19 +6887,19 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX90A-LABEL: local_atomic_fmax_ret_v2bf16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v4, v0 +; GFX90A-NEXT: ds_read_b32 v3, v0 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; GFX90A-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v5, v5, v4 ; GFX90A-NEXT: v_bfe_u32 v6, v1, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v1 @@ -6911,11 +6911,11 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX90A-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX90A-NEXT: v_perm_b32 v1, v5, v1, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v4, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7199,19 +7199,19 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX942-LABEL: local_atomic_fmax_ret_v2bf16__offset: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v4, v0 offset:65532 +; GFX942-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX942-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; GFX942-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX942-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX942-NEXT: v_max_f32_e32 v5, v5, v4 ; GFX942-NEXT: v_bfe_u32 v6, v1, 16, 1 ; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v1 @@ -7224,11 +7224,11 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX942-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[0:1] ; GFX942-NEXT: v_perm_b32 v1, v5, v1, s5 -; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v4, v1 offset:65532 +; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB25_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7371,19 +7371,19 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX90A-LABEL: local_atomic_fmax_ret_v2bf16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v4, v0 offset:65532 +; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; GFX90A-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v5, v5, v4 ; GFX90A-NEXT: v_bfe_u32 v6, v1, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v1 @@ -7395,11 +7395,11 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX90A-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX90A-NEXT: v_perm_b32 v1, v5, v1, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v4, v1 offset:65532 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB25_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll index 8597c2e256584..05558e573f5f9 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll @@ -6715,19 +6715,19 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX942-LABEL: local_atomic_fmin_ret_v2bf16: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v4, v0 +; GFX942-NEXT: ds_read_b32 v3, v0 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX942-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; GFX942-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX942-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX942-NEXT: v_min_f32_e32 v5, v5, v4 ; GFX942-NEXT: v_bfe_u32 v6, v1, 16, 1 ; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v1 @@ -6740,11 +6740,11 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX942-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[0:1] ; GFX942-NEXT: v_perm_b32 v1, v5, v1, s5 -; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v4, v1 +; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB24_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6887,19 +6887,19 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX90A-LABEL: local_atomic_fmin_ret_v2bf16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v4, v0 +; GFX90A-NEXT: ds_read_b32 v3, v0 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; GFX90A-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX90A-NEXT: v_min_f32_e32 v5, v5, v4 ; GFX90A-NEXT: v_bfe_u32 v6, v1, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v1 @@ -6911,11 +6911,11 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX90A-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX90A-NEXT: v_perm_b32 v1, v5, v1, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v4, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7199,19 +7199,19 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX942-LABEL: local_atomic_fmin_ret_v2bf16__offset: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v4, v0 offset:65532 +; GFX942-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX942-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; GFX942-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX942-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX942-NEXT: v_min_f32_e32 v5, v5, v4 ; GFX942-NEXT: v_bfe_u32 v6, v1, 16, 1 ; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v1 @@ -7224,11 +7224,11 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX942-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[0:1] ; GFX942-NEXT: v_perm_b32 v1, v5, v1, s5 -; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v4, v1 offset:65532 +; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB25_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7371,19 +7371,19 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX90A-LABEL: local_atomic_fmin_ret_v2bf16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v4, v0 offset:65532 +; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; GFX90A-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX90A-NEXT: v_min_f32_e32 v5, v5, v4 ; GFX90A-NEXT: v_bfe_u32 v6, v1, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v1 @@ -7395,11 +7395,11 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX90A-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX90A-NEXT: v_perm_b32 v1, v5, v1, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v4, v1 offset:65532 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB25_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll index 290d3117cac9a..c2f6fb079d99e 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll @@ -7483,19 +7483,19 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX942-LABEL: local_atomic_fsub_ret_v2bf16: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v4, v0 +; GFX942-NEXT: ds_read_b32 v3, v0 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX942-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; GFX942-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX942-NEXT: v_sub_f32_e32 v5, v5, v3 +; GFX942-NEXT: v_sub_f32_e32 v5, v5, v4 ; GFX942-NEXT: v_bfe_u32 v6, v1, 16, 1 ; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v1 @@ -7508,11 +7508,11 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX942-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[0:1] ; GFX942-NEXT: v_perm_b32 v1, v5, v1, s5 -; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v4, v1 +; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB24_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7655,19 +7655,19 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX90A-LABEL: local_atomic_fsub_ret_v2bf16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v4, v0 +; GFX90A-NEXT: ds_read_b32 v3, v0 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; GFX90A-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v3 +; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v4 ; GFX90A-NEXT: v_bfe_u32 v6, v1, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v1 @@ -7679,11 +7679,11 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX90A-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX90A-NEXT: v_perm_b32 v1, v5, v1, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v4, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7967,19 +7967,19 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX942-LABEL: local_atomic_fsub_ret_v2bf16__offset: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ds_read_b32 v4, v0 offset:65532 +; GFX942-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX942-NEXT: s_mov_b64 s[2:3], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX942-NEXT: s_movk_i32 s4, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX942-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s5, 0x7060302 ; GFX942-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; GFX942-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX942-NEXT: v_sub_f32_e32 v5, v5, v3 +; GFX942-NEXT: v_sub_f32_e32 v5, v5, v4 ; GFX942-NEXT: v_bfe_u32 v6, v1, 16, 1 ; GFX942-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v1 @@ -7992,11 +7992,11 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX942-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX942-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[0:1] ; GFX942-NEXT: v_perm_b32 v1, v5, v1, s5 -; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v4, v1 offset:65532 +; GFX942-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 offset:65532 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v4 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB25_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8139,19 +8139,19 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX90A-LABEL: local_atomic_fsub_ret_v2bf16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v4, v0 offset:65532 +; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; GFX90A-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v3 +; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v4 ; GFX90A-NEXT: v_bfe_u32 v6, v1, 16, 1 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v1 @@ -8163,11 +8163,11 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX90A-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX90A-NEXT: v_perm_b32 v1, v5, v1, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v4, v1 offset:65532 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v3, v1 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB25_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll b/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll index 789eb8e480214..053cf0e1c6906 100644 --- a/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll +++ b/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll @@ -217,7 +217,6 @@ define <16 x i8> @uniform_masked_load_ptr1_mask_v16i8(ptr addrspace(1) inreg noc ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, 0 @@ -237,7 +236,7 @@ define <16 x i8> @uniform_masked_load_ptr1_mask_v16i8(ptr addrspace(1) inreg noc ; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX942-NEXT: s_cbranch_execz .LBB8_2 ; GFX942-NEXT: ; %bb.1: ; %cond.load -; GFX942-NEXT: global_load_dwordx4 v[16:19], v0, s[0:1] +; GFX942-NEXT: global_load_dwordx4 v[16:19], v16, s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshrrev_b32_e32 v15, 24, v19 ; GFX942-NEXT: v_lshrrev_b32_e32 v14, 16, v19 diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll index 8d2432295dacb..26c4830dffffb 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll @@ -101,41 +101,41 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 { ; ; GFX90A-LABEL: test_mfma_loop_zeroinit: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: s_mov_b32 s0, 16 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v9, 0 -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v11, 0 -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v13, 0 -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v15, 0 -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v17, 0 -; GFX90A-NEXT: v_mov_b32_e32 v18, 0 -; GFX90A-NEXT: v_mov_b32_e32 v19, 0 -; GFX90A-NEXT: v_mov_b32_e32 v20, 0 -; GFX90A-NEXT: v_mov_b32_e32 v21, 0 -; GFX90A-NEXT: v_mov_b32_e32 v22, 0 -; GFX90A-NEXT: v_mov_b32_e32 v23, 0 -; GFX90A-NEXT: v_mov_b32_e32 v24, 0 -; GFX90A-NEXT: v_mov_b32_e32 v25, 0 -; GFX90A-NEXT: v_mov_b32_e32 v26, 0 -; GFX90A-NEXT: v_mov_b32_e32 v27, 0 -; GFX90A-NEXT: v_mov_b32_e32 v28, 0 -; GFX90A-NEXT: v_mov_b32_e32 v29, 0 -; GFX90A-NEXT: v_mov_b32_e32 v30, 0 -; GFX90A-NEXT: v_mov_b32_e32 v31, 0 +; GFX90A-NEXT: s_mov_b32 s0, 16 ; GFX90A-NEXT: v_mov_b32_e32 v32, 2.0 ; GFX90A-NEXT: v_mov_b32_e32 v33, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v0 +; GFX90A-NEXT: v_mov_b32_e32 v13, v0 +; GFX90A-NEXT: v_mov_b32_e32 v14, v0 +; GFX90A-NEXT: v_mov_b32_e32 v15, v0 +; GFX90A-NEXT: v_mov_b32_e32 v16, v0 +; GFX90A-NEXT: v_mov_b32_e32 v17, v0 +; GFX90A-NEXT: v_mov_b32_e32 v18, v0 +; GFX90A-NEXT: v_mov_b32_e32 v19, v0 +; GFX90A-NEXT: v_mov_b32_e32 v20, v0 +; GFX90A-NEXT: v_mov_b32_e32 v21, v0 +; GFX90A-NEXT: v_mov_b32_e32 v22, v0 +; GFX90A-NEXT: v_mov_b32_e32 v23, v0 +; GFX90A-NEXT: v_mov_b32_e32 v24, v0 +; GFX90A-NEXT: v_mov_b32_e32 v25, v0 +; GFX90A-NEXT: v_mov_b32_e32 v26, v0 +; GFX90A-NEXT: v_mov_b32_e32 v27, v0 +; GFX90A-NEXT: v_mov_b32_e32 v28, v0 +; GFX90A-NEXT: v_mov_b32_e32 v29, v0 +; GFX90A-NEXT: v_mov_b32_e32 v30, v0 +; GFX90A-NEXT: v_mov_b32_e32 v31, v0 ; GFX90A-NEXT: .LBB0_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_nop 1 @@ -160,41 +160,41 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 { ; ; GFX942-LABEL: test_mfma_loop_zeroinit: ; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: s_mov_b32 s0, 16 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v9, 0 -; GFX942-NEXT: v_mov_b32_e32 v10, 0 -; GFX942-NEXT: v_mov_b32_e32 v11, 0 -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v13, 0 -; GFX942-NEXT: v_mov_b32_e32 v14, 0 -; GFX942-NEXT: v_mov_b32_e32 v15, 0 -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v17, 0 -; GFX942-NEXT: v_mov_b32_e32 v18, 0 -; GFX942-NEXT: v_mov_b32_e32 v19, 0 -; GFX942-NEXT: v_mov_b32_e32 v20, 0 -; GFX942-NEXT: v_mov_b32_e32 v21, 0 -; GFX942-NEXT: v_mov_b32_e32 v22, 0 -; GFX942-NEXT: v_mov_b32_e32 v23, 0 -; GFX942-NEXT: v_mov_b32_e32 v24, 0 -; GFX942-NEXT: v_mov_b32_e32 v25, 0 -; GFX942-NEXT: v_mov_b32_e32 v26, 0 -; GFX942-NEXT: v_mov_b32_e32 v27, 0 -; GFX942-NEXT: v_mov_b32_e32 v28, 0 -; GFX942-NEXT: v_mov_b32_e32 v29, 0 -; GFX942-NEXT: v_mov_b32_e32 v30, 0 -; GFX942-NEXT: v_mov_b32_e32 v31, 0 +; GFX942-NEXT: s_mov_b32 s0, 16 ; GFX942-NEXT: v_mov_b32_e32 v32, 2.0 ; GFX942-NEXT: v_mov_b32_e32 v33, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v11, v0 +; GFX942-NEXT: v_mov_b32_e32 v12, v0 +; GFX942-NEXT: v_mov_b32_e32 v13, v0 +; GFX942-NEXT: v_mov_b32_e32 v14, v0 +; GFX942-NEXT: v_mov_b32_e32 v15, v0 +; GFX942-NEXT: v_mov_b32_e32 v16, v0 +; GFX942-NEXT: v_mov_b32_e32 v17, v0 +; GFX942-NEXT: v_mov_b32_e32 v18, v0 +; GFX942-NEXT: v_mov_b32_e32 v19, v0 +; GFX942-NEXT: v_mov_b32_e32 v20, v0 +; GFX942-NEXT: v_mov_b32_e32 v21, v0 +; GFX942-NEXT: v_mov_b32_e32 v22, v0 +; GFX942-NEXT: v_mov_b32_e32 v23, v0 +; GFX942-NEXT: v_mov_b32_e32 v24, v0 +; GFX942-NEXT: v_mov_b32_e32 v25, v0 +; GFX942-NEXT: v_mov_b32_e32 v26, v0 +; GFX942-NEXT: v_mov_b32_e32 v27, v0 +; GFX942-NEXT: v_mov_b32_e32 v28, v0 +; GFX942-NEXT: v_mov_b32_e32 v29, v0 +; GFX942-NEXT: v_mov_b32_e32 v30, v0 +; GFX942-NEXT: v_mov_b32_e32 v31, v0 ; GFX942-NEXT: .LBB0_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_nop 1 @@ -558,40 +558,40 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 { ; GFX90A-LABEL: test_mfma_loop_non_splat: ; GFX90A: ; %bb.0: ; %entry ; GFX90A-NEXT: v_mov_b32_e32 v32, 1.0 -; GFX90A-NEXT: s_mov_b32 s0, 16 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v9, 0 -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v11, 0 -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v13, 0 -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v15, 0 -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v17, 0 -; GFX90A-NEXT: v_mov_b32_e32 v18, 0 -; GFX90A-NEXT: v_mov_b32_e32 v19, 0 -; GFX90A-NEXT: v_mov_b32_e32 v20, 0 -; GFX90A-NEXT: v_mov_b32_e32 v21, 0 -; GFX90A-NEXT: v_mov_b32_e32 v22, 0 -; GFX90A-NEXT: v_mov_b32_e32 v23, 0 -; GFX90A-NEXT: v_mov_b32_e32 v24, 0 -; GFX90A-NEXT: v_mov_b32_e32 v25, 0 -; GFX90A-NEXT: v_mov_b32_e32 v26, 0 -; GFX90A-NEXT: v_mov_b32_e32 v27, 0 -; GFX90A-NEXT: v_mov_b32_e32 v28, 0 -; GFX90A-NEXT: v_mov_b32_e32 v29, 0 -; GFX90A-NEXT: v_mov_b32_e32 v30, 0 -; GFX90A-NEXT: v_mov_b32_e32 v31, 0 +; GFX90A-NEXT: s_mov_b32 s0, 16 ; GFX90A-NEXT: v_mov_b32_e32 v33, 2.0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v32 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v0 +; GFX90A-NEXT: v_mov_b32_e32 v13, v0 +; GFX90A-NEXT: v_mov_b32_e32 v14, v0 +; GFX90A-NEXT: v_mov_b32_e32 v15, v0 +; GFX90A-NEXT: v_mov_b32_e32 v16, v0 +; GFX90A-NEXT: v_mov_b32_e32 v17, v0 +; GFX90A-NEXT: v_mov_b32_e32 v18, v0 +; GFX90A-NEXT: v_mov_b32_e32 v19, v0 +; GFX90A-NEXT: v_mov_b32_e32 v20, v0 +; GFX90A-NEXT: v_mov_b32_e32 v21, v0 +; GFX90A-NEXT: v_mov_b32_e32 v22, v0 +; GFX90A-NEXT: v_mov_b32_e32 v23, v0 +; GFX90A-NEXT: v_mov_b32_e32 v24, v0 +; GFX90A-NEXT: v_mov_b32_e32 v25, v0 +; GFX90A-NEXT: v_mov_b32_e32 v26, v0 +; GFX90A-NEXT: v_mov_b32_e32 v27, v0 +; GFX90A-NEXT: v_mov_b32_e32 v28, v0 +; GFX90A-NEXT: v_mov_b32_e32 v29, v0 +; GFX90A-NEXT: v_mov_b32_e32 v30, v0 +; GFX90A-NEXT: v_mov_b32_e32 v31, v0 ; GFX90A-NEXT: .LBB2_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_nop 1 @@ -617,40 +617,40 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 { ; GFX942-LABEL: test_mfma_loop_non_splat: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: v_mov_b32_e32 v32, 1.0 -; GFX942-NEXT: s_mov_b32 s0, 16 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v9, 0 -; GFX942-NEXT: v_mov_b32_e32 v10, 0 -; GFX942-NEXT: v_mov_b32_e32 v11, 0 -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v13, 0 -; GFX942-NEXT: v_mov_b32_e32 v14, 0 -; GFX942-NEXT: v_mov_b32_e32 v15, 0 -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v17, 0 -; GFX942-NEXT: v_mov_b32_e32 v18, 0 -; GFX942-NEXT: v_mov_b32_e32 v19, 0 -; GFX942-NEXT: v_mov_b32_e32 v20, 0 -; GFX942-NEXT: v_mov_b32_e32 v21, 0 -; GFX942-NEXT: v_mov_b32_e32 v22, 0 -; GFX942-NEXT: v_mov_b32_e32 v23, 0 -; GFX942-NEXT: v_mov_b32_e32 v24, 0 -; GFX942-NEXT: v_mov_b32_e32 v25, 0 -; GFX942-NEXT: v_mov_b32_e32 v26, 0 -; GFX942-NEXT: v_mov_b32_e32 v27, 0 -; GFX942-NEXT: v_mov_b32_e32 v28, 0 -; GFX942-NEXT: v_mov_b32_e32 v29, 0 -; GFX942-NEXT: v_mov_b32_e32 v30, 0 -; GFX942-NEXT: v_mov_b32_e32 v31, 0 +; GFX942-NEXT: s_mov_b32 s0, 16 ; GFX942-NEXT: v_mov_b32_e32 v33, 2.0 +; GFX942-NEXT: v_mov_b32_e32 v1, v32 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v11, v0 +; GFX942-NEXT: v_mov_b32_e32 v12, v0 +; GFX942-NEXT: v_mov_b32_e32 v13, v0 +; GFX942-NEXT: v_mov_b32_e32 v14, v0 +; GFX942-NEXT: v_mov_b32_e32 v15, v0 +; GFX942-NEXT: v_mov_b32_e32 v16, v0 +; GFX942-NEXT: v_mov_b32_e32 v17, v0 +; GFX942-NEXT: v_mov_b32_e32 v18, v0 +; GFX942-NEXT: v_mov_b32_e32 v19, v0 +; GFX942-NEXT: v_mov_b32_e32 v20, v0 +; GFX942-NEXT: v_mov_b32_e32 v21, v0 +; GFX942-NEXT: v_mov_b32_e32 v22, v0 +; GFX942-NEXT: v_mov_b32_e32 v23, v0 +; GFX942-NEXT: v_mov_b32_e32 v24, v0 +; GFX942-NEXT: v_mov_b32_e32 v25, v0 +; GFX942-NEXT: v_mov_b32_e32 v26, v0 +; GFX942-NEXT: v_mov_b32_e32 v27, v0 +; GFX942-NEXT: v_mov_b32_e32 v28, v0 +; GFX942-NEXT: v_mov_b32_e32 v29, v0 +; GFX942-NEXT: v_mov_b32_e32 v30, v0 +; GFX942-NEXT: v_mov_b32_e32 v31, v0 ; GFX942-NEXT: .LBB2_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_nop 1 @@ -819,39 +819,39 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg) ; ; GFX90A-LABEL: test_mfma_loop_unfoldable_seq: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: s_mov_b32 s0, 16 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42f60000 -; GFX90A-NEXT: v_mov_b32_e32 v1, 0x42f80000 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0x42fa0000 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0x42fc0000 -; GFX90A-NEXT: v_mov_b32_e32 v4, 0x42fe0000 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0x43000000 -; GFX90A-NEXT: v_mov_b32_e32 v6, 0x43010000 -; GFX90A-NEXT: v_mov_b32_e32 v7, 0x43020000 -; GFX90A-NEXT: v_mov_b32_e32 v8, 0x43030000 -; GFX90A-NEXT: v_mov_b32_e32 v9, 0x43040000 -; GFX90A-NEXT: v_mov_b32_e32 v10, 0x43050000 -; GFX90A-NEXT: v_mov_b32_e32 v11, 0x43060000 -; GFX90A-NEXT: v_mov_b32_e32 v12, 0x43070000 -; GFX90A-NEXT: v_mov_b32_e32 v13, 0x43080000 -; GFX90A-NEXT: v_mov_b32_e32 v14, 0x43090000 -; GFX90A-NEXT: v_mov_b32_e32 v15, 0x430a0000 -; GFX90A-NEXT: v_mov_b32_e32 v16, 0x430b0000 -; GFX90A-NEXT: v_mov_b32_e32 v17, 0x430c0000 -; GFX90A-NEXT: v_mov_b32_e32 v18, 0x430d0000 -; GFX90A-NEXT: v_mov_b32_e32 v19, 0x430e0000 -; GFX90A-NEXT: v_mov_b32_e32 v20, 0x430f0000 -; GFX90A-NEXT: v_mov_b32_e32 v21, 0x43100000 -; GFX90A-NEXT: v_mov_b32_e32 v22, 0x43110000 -; GFX90A-NEXT: v_mov_b32_e32 v23, 0x43120000 -; GFX90A-NEXT: v_mov_b32_e32 v24, 0x43130000 -; GFX90A-NEXT: v_mov_b32_e32 v25, 0x43140000 -; GFX90A-NEXT: v_mov_b32_e32 v26, 0x43150000 -; GFX90A-NEXT: v_mov_b32_e32 v27, 0x43160000 -; GFX90A-NEXT: v_mov_b32_e32 v28, 0x43170000 -; GFX90A-NEXT: v_mov_b32_e32 v29, 0x43180000 -; GFX90A-NEXT: v_mov_b32_e32 v30, 0x43190000 ; GFX90A-NEXT: v_mov_b32_e32 v31, 0x431a0000 +; GFX90A-NEXT: v_mov_b32_e32 v30, 0x43190000 +; GFX90A-NEXT: v_mov_b32_e32 v29, 0x43180000 +; GFX90A-NEXT: v_mov_b32_e32 v28, 0x43170000 +; GFX90A-NEXT: v_mov_b32_e32 v27, 0x43160000 +; GFX90A-NEXT: v_mov_b32_e32 v26, 0x43150000 +; GFX90A-NEXT: v_mov_b32_e32 v25, 0x43140000 +; GFX90A-NEXT: v_mov_b32_e32 v24, 0x43130000 +; GFX90A-NEXT: v_mov_b32_e32 v23, 0x43120000 +; GFX90A-NEXT: v_mov_b32_e32 v22, 0x43110000 +; GFX90A-NEXT: v_mov_b32_e32 v21, 0x43100000 +; GFX90A-NEXT: v_mov_b32_e32 v20, 0x430f0000 +; GFX90A-NEXT: v_mov_b32_e32 v19, 0x430e0000 +; GFX90A-NEXT: v_mov_b32_e32 v18, 0x430d0000 +; GFX90A-NEXT: v_mov_b32_e32 v17, 0x430c0000 +; GFX90A-NEXT: v_mov_b32_e32 v16, 0x430b0000 +; GFX90A-NEXT: v_mov_b32_e32 v15, 0x430a0000 +; GFX90A-NEXT: v_mov_b32_e32 v14, 0x43090000 +; GFX90A-NEXT: v_mov_b32_e32 v13, 0x43080000 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0x43070000 +; GFX90A-NEXT: v_mov_b32_e32 v11, 0x43060000 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0x43050000 +; GFX90A-NEXT: v_mov_b32_e32 v9, 0x43040000 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0x43030000 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0x43020000 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0x43010000 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x43000000 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0x42fe0000 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0x42fc0000 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0x42fa0000 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x42f80000 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42f60000 +; GFX90A-NEXT: s_mov_b32 s0, 16 ; GFX90A-NEXT: v_mov_b32_e32 v32, 2.0 ; GFX90A-NEXT: v_mov_b32_e32 v33, 1.0 ; GFX90A-NEXT: .LBB3_1: ; %for.cond.preheader @@ -878,39 +878,39 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg) ; ; GFX942-LABEL: test_mfma_loop_unfoldable_seq: ; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x42f60000 -; GFX942-NEXT: v_mov_b32_e32 v1, 0x42f80000 -; GFX942-NEXT: v_mov_b32_e32 v2, 0x42fa0000 -; GFX942-NEXT: v_mov_b32_e32 v3, 0x42fc0000 -; GFX942-NEXT: v_mov_b32_e32 v4, 0x42fe0000 -; GFX942-NEXT: v_mov_b32_e32 v5, 0x43000000 -; GFX942-NEXT: v_mov_b32_e32 v6, 0x43010000 -; GFX942-NEXT: v_mov_b32_e32 v7, 0x43020000 -; GFX942-NEXT: v_mov_b32_e32 v8, 0x43030000 -; GFX942-NEXT: v_mov_b32_e32 v9, 0x43040000 -; GFX942-NEXT: v_mov_b32_e32 v10, 0x43050000 -; GFX942-NEXT: v_mov_b32_e32 v11, 0x43060000 -; GFX942-NEXT: v_mov_b32_e32 v12, 0x43070000 -; GFX942-NEXT: v_mov_b32_e32 v13, 0x43080000 -; GFX942-NEXT: v_mov_b32_e32 v14, 0x43090000 -; GFX942-NEXT: v_mov_b32_e32 v15, 0x430a0000 -; GFX942-NEXT: v_mov_b32_e32 v16, 0x430b0000 -; GFX942-NEXT: v_mov_b32_e32 v17, 0x430c0000 -; GFX942-NEXT: v_mov_b32_e32 v18, 0x430d0000 -; GFX942-NEXT: v_mov_b32_e32 v19, 0x430e0000 -; GFX942-NEXT: v_mov_b32_e32 v20, 0x430f0000 -; GFX942-NEXT: v_mov_b32_e32 v21, 0x43100000 -; GFX942-NEXT: v_mov_b32_e32 v22, 0x43110000 -; GFX942-NEXT: v_mov_b32_e32 v23, 0x43120000 -; GFX942-NEXT: v_mov_b32_e32 v24, 0x43130000 -; GFX942-NEXT: v_mov_b32_e32 v25, 0x43140000 -; GFX942-NEXT: v_mov_b32_e32 v26, 0x43150000 -; GFX942-NEXT: v_mov_b32_e32 v27, 0x43160000 -; GFX942-NEXT: v_mov_b32_e32 v28, 0x43170000 -; GFX942-NEXT: v_mov_b32_e32 v29, 0x43180000 -; GFX942-NEXT: v_mov_b32_e32 v30, 0x43190000 ; GFX942-NEXT: v_mov_b32_e32 v31, 0x431a0000 +; GFX942-NEXT: v_mov_b32_e32 v30, 0x43190000 +; GFX942-NEXT: v_mov_b32_e32 v29, 0x43180000 +; GFX942-NEXT: v_mov_b32_e32 v28, 0x43170000 +; GFX942-NEXT: v_mov_b32_e32 v27, 0x43160000 +; GFX942-NEXT: v_mov_b32_e32 v26, 0x43150000 +; GFX942-NEXT: v_mov_b32_e32 v25, 0x43140000 +; GFX942-NEXT: v_mov_b32_e32 v24, 0x43130000 +; GFX942-NEXT: v_mov_b32_e32 v23, 0x43120000 +; GFX942-NEXT: v_mov_b32_e32 v22, 0x43110000 +; GFX942-NEXT: v_mov_b32_e32 v21, 0x43100000 +; GFX942-NEXT: v_mov_b32_e32 v20, 0x430f0000 +; GFX942-NEXT: v_mov_b32_e32 v19, 0x430e0000 +; GFX942-NEXT: v_mov_b32_e32 v18, 0x430d0000 +; GFX942-NEXT: v_mov_b32_e32 v17, 0x430c0000 +; GFX942-NEXT: v_mov_b32_e32 v16, 0x430b0000 +; GFX942-NEXT: v_mov_b32_e32 v15, 0x430a0000 +; GFX942-NEXT: v_mov_b32_e32 v14, 0x43090000 +; GFX942-NEXT: v_mov_b32_e32 v13, 0x43080000 +; GFX942-NEXT: v_mov_b32_e32 v12, 0x43070000 +; GFX942-NEXT: v_mov_b32_e32 v11, 0x43060000 +; GFX942-NEXT: v_mov_b32_e32 v10, 0x43050000 +; GFX942-NEXT: v_mov_b32_e32 v9, 0x43040000 +; GFX942-NEXT: v_mov_b32_e32 v8, 0x43030000 +; GFX942-NEXT: v_mov_b32_e32 v7, 0x43020000 +; GFX942-NEXT: v_mov_b32_e32 v6, 0x43010000 +; GFX942-NEXT: v_mov_b32_e32 v5, 0x43000000 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x42fe0000 +; GFX942-NEXT: v_mov_b32_e32 v3, 0x42fc0000 +; GFX942-NEXT: v_mov_b32_e32 v2, 0x42fa0000 +; GFX942-NEXT: v_mov_b32_e32 v1, 0x42f80000 +; GFX942-NEXT: v_mov_b32_e32 v0, 0x42f60000 +; GFX942-NEXT: s_mov_b32 s0, 16 ; GFX942-NEXT: v_mov_b32_e32 v32, 2.0 ; GFX942-NEXT: v_mov_b32_e32 v33, 1.0 ; GFX942-NEXT: .LBB3_1: ; %for.cond.preheader @@ -1574,42 +1574,42 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; GFX90A-LABEL: test_mfma_loop_mixed_init: ; GFX90A: ; %bb.0: ; %entry ; GFX90A-NEXT: s_load_dword s1, s[4:5], 0x2c -; GFX90A-NEXT: s_mov_b32 s0, 16 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_mov_b32 s0, 16 ; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v32, 2.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v9, 0 -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v11, 0 -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v13, 0 -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v15, 0 -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v17, 0 -; GFX90A-NEXT: v_mov_b32_e32 v18, 0 -; GFX90A-NEXT: v_mov_b32_e32 v19, 0 -; GFX90A-NEXT: v_mov_b32_e32 v20, 0 -; GFX90A-NEXT: v_mov_b32_e32 v21, 0 -; GFX90A-NEXT: v_mov_b32_e32 v22, 0 -; GFX90A-NEXT: v_mov_b32_e32 v23, 0 -; GFX90A-NEXT: v_mov_b32_e32 v24, 0 -; GFX90A-NEXT: v_mov_b32_e32 v25, 0 -; GFX90A-NEXT: v_mov_b32_e32 v26, 0 -; GFX90A-NEXT: v_mov_b32_e32 v27, 0 -; GFX90A-NEXT: v_mov_b32_e32 v28, 0 -; GFX90A-NEXT: v_mov_b32_e32 v29, 0 -; GFX90A-NEXT: v_mov_b32_e32 v30, 0 -; GFX90A-NEXT: v_mov_b32_e32 v31, 0 -; GFX90A-NEXT: v_mov_b32_e32 v32, 2.0 ; GFX90A-NEXT: v_mov_b32_e32 v33, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: v_mov_b32_e32 v10, v2 +; GFX90A-NEXT: v_mov_b32_e32 v11, v2 +; GFX90A-NEXT: v_mov_b32_e32 v12, v2 +; GFX90A-NEXT: v_mov_b32_e32 v13, v2 +; GFX90A-NEXT: v_mov_b32_e32 v14, v2 +; GFX90A-NEXT: v_mov_b32_e32 v15, v2 +; GFX90A-NEXT: v_mov_b32_e32 v16, v2 +; GFX90A-NEXT: v_mov_b32_e32 v17, v2 +; GFX90A-NEXT: v_mov_b32_e32 v18, v2 +; GFX90A-NEXT: v_mov_b32_e32 v19, v2 +; GFX90A-NEXT: v_mov_b32_e32 v20, v2 +; GFX90A-NEXT: v_mov_b32_e32 v21, v2 +; GFX90A-NEXT: v_mov_b32_e32 v22, v2 +; GFX90A-NEXT: v_mov_b32_e32 v23, v2 +; GFX90A-NEXT: v_mov_b32_e32 v24, v2 +; GFX90A-NEXT: v_mov_b32_e32 v25, v2 +; GFX90A-NEXT: v_mov_b32_e32 v26, v2 +; GFX90A-NEXT: v_mov_b32_e32 v27, v2 +; GFX90A-NEXT: v_mov_b32_e32 v28, v2 +; GFX90A-NEXT: v_mov_b32_e32 v29, v2 +; GFX90A-NEXT: v_mov_b32_e32 v30, v2 +; GFX90A-NEXT: v_mov_b32_e32 v31, v2 ; GFX90A-NEXT: .LBB6_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_nop 1 @@ -1635,42 +1635,42 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; GFX942-LABEL: test_mfma_loop_mixed_init: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dword s1, s[4:5], 0x2c -; GFX942-NEXT: s_mov_b32 s0, 16 ; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: s_mov_b32 s0, 16 ; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v32, 2.0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, s1 -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v9, 0 -; GFX942-NEXT: v_mov_b32_e32 v10, 0 -; GFX942-NEXT: v_mov_b32_e32 v11, 0 -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v13, 0 -; GFX942-NEXT: v_mov_b32_e32 v14, 0 -; GFX942-NEXT: v_mov_b32_e32 v15, 0 -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v17, 0 -; GFX942-NEXT: v_mov_b32_e32 v18, 0 -; GFX942-NEXT: v_mov_b32_e32 v19, 0 -; GFX942-NEXT: v_mov_b32_e32 v20, 0 -; GFX942-NEXT: v_mov_b32_e32 v21, 0 -; GFX942-NEXT: v_mov_b32_e32 v22, 0 -; GFX942-NEXT: v_mov_b32_e32 v23, 0 -; GFX942-NEXT: v_mov_b32_e32 v24, 0 -; GFX942-NEXT: v_mov_b32_e32 v25, 0 -; GFX942-NEXT: v_mov_b32_e32 v26, 0 -; GFX942-NEXT: v_mov_b32_e32 v27, 0 -; GFX942-NEXT: v_mov_b32_e32 v28, 0 -; GFX942-NEXT: v_mov_b32_e32 v29, 0 -; GFX942-NEXT: v_mov_b32_e32 v30, 0 -; GFX942-NEXT: v_mov_b32_e32 v31, 0 -; GFX942-NEXT: v_mov_b32_e32 v32, 2.0 ; GFX942-NEXT: v_mov_b32_e32 v33, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v2 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v2 +; GFX942-NEXT: v_mov_b32_e32 v10, v2 +; GFX942-NEXT: v_mov_b32_e32 v11, v2 +; GFX942-NEXT: v_mov_b32_e32 v12, v2 +; GFX942-NEXT: v_mov_b32_e32 v13, v2 +; GFX942-NEXT: v_mov_b32_e32 v14, v2 +; GFX942-NEXT: v_mov_b32_e32 v15, v2 +; GFX942-NEXT: v_mov_b32_e32 v16, v2 +; GFX942-NEXT: v_mov_b32_e32 v17, v2 +; GFX942-NEXT: v_mov_b32_e32 v18, v2 +; GFX942-NEXT: v_mov_b32_e32 v19, v2 +; GFX942-NEXT: v_mov_b32_e32 v20, v2 +; GFX942-NEXT: v_mov_b32_e32 v21, v2 +; GFX942-NEXT: v_mov_b32_e32 v22, v2 +; GFX942-NEXT: v_mov_b32_e32 v23, v2 +; GFX942-NEXT: v_mov_b32_e32 v24, v2 +; GFX942-NEXT: v_mov_b32_e32 v25, v2 +; GFX942-NEXT: v_mov_b32_e32 v26, v2 +; GFX942-NEXT: v_mov_b32_e32 v27, v2 +; GFX942-NEXT: v_mov_b32_e32 v28, v2 +; GFX942-NEXT: v_mov_b32_e32 v29, v2 +; GFX942-NEXT: v_mov_b32_e32 v30, v2 +; GFX942-NEXT: v_mov_b32_e32 v31, v2 ; GFX942-NEXT: .LBB6_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_nop 1 @@ -2539,41 +2539,41 @@ define <32 x float> @test_mfma_loop_zeroinit_ret_use() #0 { ; GFX90A-LABEL: test_mfma_loop_zeroinit_ret_use: ; GFX90A: ; %bb.0: ; %entry ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s4, 16 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v9, 0 -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v11, 0 -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v13, 0 -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v15, 0 -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v17, 0 -; GFX90A-NEXT: v_mov_b32_e32 v18, 0 -; GFX90A-NEXT: v_mov_b32_e32 v19, 0 -; GFX90A-NEXT: v_mov_b32_e32 v20, 0 -; GFX90A-NEXT: v_mov_b32_e32 v21, 0 -; GFX90A-NEXT: v_mov_b32_e32 v22, 0 -; GFX90A-NEXT: v_mov_b32_e32 v23, 0 -; GFX90A-NEXT: v_mov_b32_e32 v24, 0 -; GFX90A-NEXT: v_mov_b32_e32 v25, 0 -; GFX90A-NEXT: v_mov_b32_e32 v26, 0 -; GFX90A-NEXT: v_mov_b32_e32 v27, 0 -; GFX90A-NEXT: v_mov_b32_e32 v28, 0 -; GFX90A-NEXT: v_mov_b32_e32 v29, 0 -; GFX90A-NEXT: v_mov_b32_e32 v30, 0 -; GFX90A-NEXT: v_mov_b32_e32 v31, 0 +; GFX90A-NEXT: s_mov_b32 s4, 16 ; GFX90A-NEXT: v_mov_b32_e32 v32, 2.0 ; GFX90A-NEXT: v_mov_b32_e32 v33, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v0 +; GFX90A-NEXT: v_mov_b32_e32 v13, v0 +; GFX90A-NEXT: v_mov_b32_e32 v14, v0 +; GFX90A-NEXT: v_mov_b32_e32 v15, v0 +; GFX90A-NEXT: v_mov_b32_e32 v16, v0 +; GFX90A-NEXT: v_mov_b32_e32 v17, v0 +; GFX90A-NEXT: v_mov_b32_e32 v18, v0 +; GFX90A-NEXT: v_mov_b32_e32 v19, v0 +; GFX90A-NEXT: v_mov_b32_e32 v20, v0 +; GFX90A-NEXT: v_mov_b32_e32 v21, v0 +; GFX90A-NEXT: v_mov_b32_e32 v22, v0 +; GFX90A-NEXT: v_mov_b32_e32 v23, v0 +; GFX90A-NEXT: v_mov_b32_e32 v24, v0 +; GFX90A-NEXT: v_mov_b32_e32 v25, v0 +; GFX90A-NEXT: v_mov_b32_e32 v26, v0 +; GFX90A-NEXT: v_mov_b32_e32 v27, v0 +; GFX90A-NEXT: v_mov_b32_e32 v28, v0 +; GFX90A-NEXT: v_mov_b32_e32 v29, v0 +; GFX90A-NEXT: v_mov_b32_e32 v30, v0 +; GFX90A-NEXT: v_mov_b32_e32 v31, v0 ; GFX90A-NEXT: .LBB10_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_nop 1 @@ -2587,41 +2587,41 @@ define <32 x float> @test_mfma_loop_zeroinit_ret_use() #0 { ; GFX942-LABEL: test_mfma_loop_zeroinit_ret_use: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_mov_b32 s0, 16 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v9, 0 -; GFX942-NEXT: v_mov_b32_e32 v10, 0 -; GFX942-NEXT: v_mov_b32_e32 v11, 0 -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v13, 0 -; GFX942-NEXT: v_mov_b32_e32 v14, 0 -; GFX942-NEXT: v_mov_b32_e32 v15, 0 -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v17, 0 -; GFX942-NEXT: v_mov_b32_e32 v18, 0 -; GFX942-NEXT: v_mov_b32_e32 v19, 0 -; GFX942-NEXT: v_mov_b32_e32 v20, 0 -; GFX942-NEXT: v_mov_b32_e32 v21, 0 -; GFX942-NEXT: v_mov_b32_e32 v22, 0 -; GFX942-NEXT: v_mov_b32_e32 v23, 0 -; GFX942-NEXT: v_mov_b32_e32 v24, 0 -; GFX942-NEXT: v_mov_b32_e32 v25, 0 -; GFX942-NEXT: v_mov_b32_e32 v26, 0 -; GFX942-NEXT: v_mov_b32_e32 v27, 0 -; GFX942-NEXT: v_mov_b32_e32 v28, 0 -; GFX942-NEXT: v_mov_b32_e32 v29, 0 -; GFX942-NEXT: v_mov_b32_e32 v30, 0 -; GFX942-NEXT: v_mov_b32_e32 v31, 0 +; GFX942-NEXT: s_mov_b32 s0, 16 ; GFX942-NEXT: v_mov_b32_e32 v32, 2.0 ; GFX942-NEXT: v_mov_b32_e32 v33, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v11, v0 +; GFX942-NEXT: v_mov_b32_e32 v12, v0 +; GFX942-NEXT: v_mov_b32_e32 v13, v0 +; GFX942-NEXT: v_mov_b32_e32 v14, v0 +; GFX942-NEXT: v_mov_b32_e32 v15, v0 +; GFX942-NEXT: v_mov_b32_e32 v16, v0 +; GFX942-NEXT: v_mov_b32_e32 v17, v0 +; GFX942-NEXT: v_mov_b32_e32 v18, v0 +; GFX942-NEXT: v_mov_b32_e32 v19, v0 +; GFX942-NEXT: v_mov_b32_e32 v20, v0 +; GFX942-NEXT: v_mov_b32_e32 v21, v0 +; GFX942-NEXT: v_mov_b32_e32 v22, v0 +; GFX942-NEXT: v_mov_b32_e32 v23, v0 +; GFX942-NEXT: v_mov_b32_e32 v24, v0 +; GFX942-NEXT: v_mov_b32_e32 v25, v0 +; GFX942-NEXT: v_mov_b32_e32 v26, v0 +; GFX942-NEXT: v_mov_b32_e32 v27, v0 +; GFX942-NEXT: v_mov_b32_e32 v28, v0 +; GFX942-NEXT: v_mov_b32_e32 v29, v0 +; GFX942-NEXT: v_mov_b32_e32 v30, v0 +; GFX942-NEXT: v_mov_b32_e32 v31, v0 ; GFX942-NEXT: .LBB10_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_nop 1 @@ -2732,40 +2732,40 @@ define <32 x float> @test_mfma_loop_non_splat_ret_use() #0 { ; GFX90A: ; %bb.0: ; %entry ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v32, 1.0 -; GFX90A-NEXT: s_mov_b32 s4, 16 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v9, 0 -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v11, 0 -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v13, 0 -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v15, 0 -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v17, 0 -; GFX90A-NEXT: v_mov_b32_e32 v18, 0 -; GFX90A-NEXT: v_mov_b32_e32 v19, 0 -; GFX90A-NEXT: v_mov_b32_e32 v20, 0 -; GFX90A-NEXT: v_mov_b32_e32 v21, 0 -; GFX90A-NEXT: v_mov_b32_e32 v22, 0 -; GFX90A-NEXT: v_mov_b32_e32 v23, 0 -; GFX90A-NEXT: v_mov_b32_e32 v24, 0 -; GFX90A-NEXT: v_mov_b32_e32 v25, 0 -; GFX90A-NEXT: v_mov_b32_e32 v26, 0 -; GFX90A-NEXT: v_mov_b32_e32 v27, 0 -; GFX90A-NEXT: v_mov_b32_e32 v28, 0 -; GFX90A-NEXT: v_mov_b32_e32 v29, 0 -; GFX90A-NEXT: v_mov_b32_e32 v30, 0 -; GFX90A-NEXT: v_mov_b32_e32 v31, 0 +; GFX90A-NEXT: s_mov_b32 s4, 16 ; GFX90A-NEXT: v_mov_b32_e32 v33, 2.0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v32 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v0 +; GFX90A-NEXT: v_mov_b32_e32 v13, v0 +; GFX90A-NEXT: v_mov_b32_e32 v14, v0 +; GFX90A-NEXT: v_mov_b32_e32 v15, v0 +; GFX90A-NEXT: v_mov_b32_e32 v16, v0 +; GFX90A-NEXT: v_mov_b32_e32 v17, v0 +; GFX90A-NEXT: v_mov_b32_e32 v18, v0 +; GFX90A-NEXT: v_mov_b32_e32 v19, v0 +; GFX90A-NEXT: v_mov_b32_e32 v20, v0 +; GFX90A-NEXT: v_mov_b32_e32 v21, v0 +; GFX90A-NEXT: v_mov_b32_e32 v22, v0 +; GFX90A-NEXT: v_mov_b32_e32 v23, v0 +; GFX90A-NEXT: v_mov_b32_e32 v24, v0 +; GFX90A-NEXT: v_mov_b32_e32 v25, v0 +; GFX90A-NEXT: v_mov_b32_e32 v26, v0 +; GFX90A-NEXT: v_mov_b32_e32 v27, v0 +; GFX90A-NEXT: v_mov_b32_e32 v28, v0 +; GFX90A-NEXT: v_mov_b32_e32 v29, v0 +; GFX90A-NEXT: v_mov_b32_e32 v30, v0 +; GFX90A-NEXT: v_mov_b32_e32 v31, v0 ; GFX90A-NEXT: .LBB11_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_nop 1 @@ -2780,40 +2780,40 @@ define <32 x float> @test_mfma_loop_non_splat_ret_use() #0 { ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v32, 1.0 -; GFX942-NEXT: s_mov_b32 s0, 16 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v9, 0 -; GFX942-NEXT: v_mov_b32_e32 v10, 0 -; GFX942-NEXT: v_mov_b32_e32 v11, 0 -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v13, 0 -; GFX942-NEXT: v_mov_b32_e32 v14, 0 -; GFX942-NEXT: v_mov_b32_e32 v15, 0 -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v17, 0 -; GFX942-NEXT: v_mov_b32_e32 v18, 0 -; GFX942-NEXT: v_mov_b32_e32 v19, 0 -; GFX942-NEXT: v_mov_b32_e32 v20, 0 -; GFX942-NEXT: v_mov_b32_e32 v21, 0 -; GFX942-NEXT: v_mov_b32_e32 v22, 0 -; GFX942-NEXT: v_mov_b32_e32 v23, 0 -; GFX942-NEXT: v_mov_b32_e32 v24, 0 -; GFX942-NEXT: v_mov_b32_e32 v25, 0 -; GFX942-NEXT: v_mov_b32_e32 v26, 0 -; GFX942-NEXT: v_mov_b32_e32 v27, 0 -; GFX942-NEXT: v_mov_b32_e32 v28, 0 -; GFX942-NEXT: v_mov_b32_e32 v29, 0 -; GFX942-NEXT: v_mov_b32_e32 v30, 0 -; GFX942-NEXT: v_mov_b32_e32 v31, 0 +; GFX942-NEXT: s_mov_b32 s0, 16 ; GFX942-NEXT: v_mov_b32_e32 v33, 2.0 +; GFX942-NEXT: v_mov_b32_e32 v1, v32 +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: v_mov_b32_e32 v10, v0 +; GFX942-NEXT: v_mov_b32_e32 v11, v0 +; GFX942-NEXT: v_mov_b32_e32 v12, v0 +; GFX942-NEXT: v_mov_b32_e32 v13, v0 +; GFX942-NEXT: v_mov_b32_e32 v14, v0 +; GFX942-NEXT: v_mov_b32_e32 v15, v0 +; GFX942-NEXT: v_mov_b32_e32 v16, v0 +; GFX942-NEXT: v_mov_b32_e32 v17, v0 +; GFX942-NEXT: v_mov_b32_e32 v18, v0 +; GFX942-NEXT: v_mov_b32_e32 v19, v0 +; GFX942-NEXT: v_mov_b32_e32 v20, v0 +; GFX942-NEXT: v_mov_b32_e32 v21, v0 +; GFX942-NEXT: v_mov_b32_e32 v22, v0 +; GFX942-NEXT: v_mov_b32_e32 v23, v0 +; GFX942-NEXT: v_mov_b32_e32 v24, v0 +; GFX942-NEXT: v_mov_b32_e32 v25, v0 +; GFX942-NEXT: v_mov_b32_e32 v26, v0 +; GFX942-NEXT: v_mov_b32_e32 v27, v0 +; GFX942-NEXT: v_mov_b32_e32 v28, v0 +; GFX942-NEXT: v_mov_b32_e32 v29, v0 +; GFX942-NEXT: v_mov_b32_e32 v30, v0 +; GFX942-NEXT: v_mov_b32_e32 v31, v0 ; GFX942-NEXT: .LBB11_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_nop 1 diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll index 071dfb9bf2c1f..401ad6489bf6e 100644 --- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll +++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) { ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX942-NEXT: v_mov_b32_e32 v1, 0 ; GFX942-NEXT: s_mov_b32 s2, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v1 ; GFX942-NEXT: s_mov_b32 s6, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/phi-av-pressure.ll b/llvm/test/CodeGen/AMDGPU/phi-av-pressure.ll new file mode 100644 index 0000000000000..016931452d657 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/phi-av-pressure.ll @@ -0,0 +1,129 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -march=amdgcn -mcpu=gfx950 < %s | FileCheck %s -check-prefix=GFX950 + +declare hidden i32 @_ZN25__hip_builtin_threadIdx_t7__get_xEv() + +; Before #177352 this test showed poor scheduling due to register pressure +; problems. The symptom was that two global_load instructions were immediately +; followed by s_waitcnt vmcnt(0). + +define amdgpu_kernel void @main(i1 %arg, ptr %ptr, ptr addrspace(1) %ptr1, ptr addrspace(5) %ptr5) { +; GFX950-LABEL: main: +; GFX950: ; %bb.0: ; %bb +; GFX950-NEXT: s_load_dword s33, s[4:5], 0x3c +; GFX950-NEXT: s_mov_b32 s14, s10 +; GFX950-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX950-NEXT: s_load_dword s6, s[4:5], 0x24 +; GFX950-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x2c +; GFX950-NEXT: s_mov_b32 s12, s8 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: scratch_load_dwordx4 v[40:43], off, s33 +; GFX950-NEXT: s_mov_b32 s13, s9 +; GFX950-NEXT: s_bitcmp1_b32 s6, 0 +; GFX950-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GFX950-NEXT: s_add_u32 s8, s4, 64 +; GFX950-NEXT: s_addc_u32 s9, s5, 0 +; GFX950-NEXT: s_getpc_b64 s[16:17] +; GFX950-NEXT: s_add_u32 s16, s16, _ZN25__hip_builtin_threadIdx_t7__get_xEv@rel32@lo+4 +; GFX950-NEXT: s_addc_u32 s17, s17, _ZN25__hip_builtin_threadIdx_t7__get_xEv@rel32@hi+12 +; GFX950-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX950-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v31, v0 +; GFX950-NEXT: s_mov_b32 s32, 0 +; GFX950-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX950-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-NEXT: v_lshl_add_u64 v[10:11], v[0:1], 3, s[38:39] +; GFX950-NEXT: global_load_dwordx4 v[2:5], v[10:11], off +; GFX950-NEXT: global_load_dwordx4 v[6:9], v1, s[38:39] +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: v_mov_b64_e32 v[10:11], s[36:37] +; GFX950-NEXT: s_and_b64 vcc, exec, s[34:35] +; GFX950-NEXT: v_mov_b32_e32 v12, v1 +; GFX950-NEXT: v_mov_b32_e32 v14, v1 +; GFX950-NEXT: v_mov_b32_e32 v15, v1 +; GFX950-NEXT: v_mov_b32_e32 v16, v1 +; GFX950-NEXT: v_mov_b32_e32 v18, v1 +; GFX950-NEXT: v_mov_b32_e32 v17, v1 +; GFX950-NEXT: v_mov_b32_e32 v19, v1 +; GFX950-NEXT: v_mov_b32_e32 v20, v1 +; GFX950-NEXT: v_mov_b32_e32 v21, v1 +; GFX950-NEXT: .LBB0_1: ; %bb4 +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: v_mov_b32_e32 v13, v1 +; GFX950-NEXT: v_lshlrev_b64 v[22:23], 3, v[12:13] +; GFX950-NEXT: v_lshl_add_u64 v[22:23], s[38:39], 0, v[22:23] +; GFX950-NEXT: global_load_dwordx4 v[22:25], v[22:23], off +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: v_lshl_add_u64 v[26:27], v[0:1], 3, s[38:39] +; GFX950-NEXT: s_waitcnt vmcnt(1) +; GFX950-NEXT: v_or_b32_e32 v0, v40, v6 +; GFX950-NEXT: v_or_b32_e32 v13, v41, v7 +; GFX950-NEXT: v_or_b32_e32 v30, v42, v8 +; GFX950-NEXT: v_or_b32_e32 v31, v43, v9 +; GFX950-NEXT: global_load_dwordx4 v[26:29], v[26:27], off +; GFX950-NEXT: v_or_b32_e32 v18, v5, v18 +; GFX950-NEXT: v_or_b32_e32 v16, v4, v16 +; GFX950-NEXT: v_or_b32_e32 v15, v3, v15 +; GFX950-NEXT: v_or_b32_e32 v14, v2, v14 +; GFX950-NEXT: v_or_b32_e32 v12, 1, v12 +; GFX950-NEXT: v_mov_b32_e32 v40, 0 +; GFX950-NEXT: v_mov_b32_e32 v41, 0 +; GFX950-NEXT: v_mov_b32_e32 v42, 0 +; GFX950-NEXT: v_mov_b32_e32 v43, 0 +; GFX950-NEXT: s_waitcnt vmcnt(1) +; GFX950-NEXT: v_or_b32_e32 v25, v25, v31 +; GFX950-NEXT: v_or_b32_e32 v24, v24, v30 +; GFX950-NEXT: v_or_b32_e32 v23, v23, v13 +; GFX950-NEXT: v_or_b32_e32 v22, v22, v0 +; GFX950-NEXT: scratch_store_dwordx4 off, v[22:25], s33 +; GFX950-NEXT: flat_load_dword v0, v[10:11] +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_or_b32_e32 v21, v29, v21 +; GFX950-NEXT: v_or_b32_e32 v20, v28, v20 +; GFX950-NEXT: v_or_b32_e32 v19, v27, v19 +; GFX950-NEXT: v_or_b32_e32 v17, v26, v17 +; GFX950-NEXT: s_mov_b64 vcc, vcc +; GFX950-NEXT: s_cbranch_vccz .LBB0_1 +; GFX950-NEXT: ; %bb.2: ; %bb2 +; GFX950-NEXT: v_or_b32_e32 v3, v21, v18 +; GFX950-NEXT: v_or_b32_e32 v2, v20, v16 +; GFX950-NEXT: v_or_b32_e32 v1, v19, v15 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: v_or_b32_e32 v0, v17, v14 +; GFX950-NEXT: scratch_store_dwordx4 off, v[0:3], s33 +; GFX950-NEXT: s_endpgm +bb: + %i = load <4 x i32>, ptr addrspace(5) %ptr5 + %i1 = tail call i32 @_ZN25__hip_builtin_threadIdx_t7__get_xEv() + br label %bb4 + +bb2: ; preds = %bb4 + %i3 = or <4 x i32> %i17, %i13 + store <4 x i32> %i3, ptr addrspace(5) %ptr5 + ret void + +bb4: ; preds = %bb4, %bb + %i5 = phi <4 x i32> [ %i, %bb ], [ zeroinitializer, %bb4 ] + %i6 = phi i32 [ 0, %bb ], [ %i24, %bb4 ] + %i7 = phi i32 [ 0, %bb ], [ %i25, %bb4 ] + %i8 = phi <4 x i32> [ zeroinitializer, %bb ], [ %i17, %bb4 ] + %i9 = phi <4 x i32> [ zeroinitializer, %bb ], [ %i13, %bb4 ] + %i10 = zext i32 %i1 to i64 + %i11 = getelementptr i64, ptr addrspace(1) %ptr1, i64 %i10 + %i12 = load <4 x i32>, ptr addrspace(1) %i11 + %i13 = or <4 x i32> %i12, %i9 + %i14 = zext i32 %i6 to i64 + %i15 = getelementptr i64, ptr addrspace(1) %ptr1, i64 %i14 + %i16 = load <4 x i32>, ptr addrspace(1) %i15 + %i17 = or <4 x i32> %i16, %i8 + %i18 = zext i32 %i7 to i64 + %i19 = getelementptr i64, ptr addrspace(1) %ptr1, i64 %i18 + %i20 = load <4 x i32>, ptr addrspace(1) %i19 + %i21 = load <4 x i32>, ptr addrspace(1) %ptr1 + %i22 = or <4 x i32> %i5, %i21 + %i23 = or <4 x i32> %i20, %i22 + store <4 x i32> %i23, ptr addrspace(5) %ptr5 + %i24 = load i32, ptr %ptr + %i25 = or i32 %i7, 1 + br i1 %arg, label %bb2, label %bb4 +} diff --git a/llvm/test/CodeGen/AMDGPU/phi-vgpr-input-moveimm.mir b/llvm/test/CodeGen/AMDGPU/phi-vgpr-input-moveimm.mir index 31ed09b8a1516..6146e811e0209 100644 --- a/llvm/test/CodeGen/AMDGPU/phi-vgpr-input-moveimm.mir +++ b/llvm/test/CodeGen/AMDGPU/phi-vgpr-input-moveimm.mir @@ -56,61 +56,6 @@ body: | S_BRANCH %bb.2 ... ---- -name: phi_moveimm_subreg_input -tracksRegLiveness: true -body: | - ; GCN-LABEL: name: phi_moveimm_subreg_input - ; GCN: bb.0: - ; GCN-NEXT: successors: %bb.1(0x80000000) - ; GCN-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[V_MOV_B64_e32_:%[0-9]+]]:vreg_64 = V_MOV_B64_e32 0, implicit $exec - ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $sgpr2_sgpr3 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.1: - ; GCN-NEXT: successors: %bb.2(0x80000000) - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[S_ADD_U:%[0-9]+]]:sreg_64 = S_ADD_U64_PSEUDO [[COPY]], [[COPY1]], implicit-def $scc - ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[S_ADD_U]], implicit $exec - ; GCN-NEXT: S_BRANCH %bb.2 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.2: - ; GCN-NEXT: successors: %bb.3(0x80000000) - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B64_e32_]].sub0, %bb.3, [[COPY2]].sub0, %bb.1 - ; GCN-NEXT: S_BRANCH %bb.3 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.3: - ; GCN-NEXT: successors: %bb.2(0x80000000) - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: S_BRANCH %bb.2 - bb.0: - successors: %bb.1 - liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3 - - %0:vreg_64 = V_MOV_B64_e32 0, implicit $exec - - %4:sreg_64 = COPY $sgpr0_sgpr1 - %5:sreg_64 = COPY $sgpr2_sgpr3 - - bb.1: - successors: %bb.2 - %2:sreg_64 = S_ADD_U64_PSEUDO %4, %5, implicit-def $scc - S_BRANCH %bb.2 - - bb.2: - successors: %bb.3 - %3:sreg_32 = PHI %1.sub0:sreg_64, %bb.3, %2.sub0:sreg_64, %bb.1 - S_BRANCH %bb.3 - - bb.3: - successors: %bb.2 - %1:sreg_64 = COPY %0.sub0:vreg_64 - S_BRANCH %bb.2 -... - --- name: phi_moveimm_bad_opcode_input tracksRegLiveness: true diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index 1156f2718cf1e..098a60dd61a1c 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -722,8 +722,8 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v2, vcc ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x2800, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX90A-NEXT: v_mov_b32_e32 v1, 0x7f ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], 0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x7f ; GFX90A-NEXT: s_movk_i32 s2, 0xf000 ; GFX90A-NEXT: s_movk_i32 s3, 0x1000 ; GFX90A-NEXT: s_movk_i32 s4, 0x2000 diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll index e7bc851817f3a..8f8e2c0ba52fc 100644 --- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll +++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll @@ -113,16 +113,16 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: v_readlane_b32 s6, v57, 6 ; GLOBALNESS1-NEXT: v_readlane_b32 s7, v57, 7 ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7] -; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_3 +; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_28 ; GLOBALNESS1-NEXT: .LBB1_2: ; %Flow15 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[4:5] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[58:59], v[0:1], v[0:1] op_sel:[0,1] -; GLOBALNESS1-NEXT: s_branch .LBB1_4 -; GLOBALNESS1-NEXT: .LBB1_3: ; %bb73.i +; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], 0 +; GLOBALNESS1-NEXT: .LBB1_3: ; %Flow28 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[42:43], off -; GLOBALNESS1-NEXT: s_branch .LBB1_2 +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7] +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[58:59], v[0:1], v[0:1] op_sel:[0,1] +; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_29 ; GLOBALNESS1-NEXT: .LBB1_4: ; %bb5 ; GLOBALNESS1-NEXT: ; =>This Loop Header: Depth=1 ; GLOBALNESS1-NEXT: ; Child Loop BB1_16 Depth 2 @@ -171,8 +171,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GLOBALNESS1-NEXT: .LBB1_9: ; %Flow25 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], -1 ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[4:5] -; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_24 +; GLOBALNESS1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_3 ; GLOBALNESS1-NEXT: ; %bb.10: ; %baz.exit.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: flat_load_dword v0, v[44:45] @@ -181,7 +183,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v1, 0x3ff00000 ; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[52:53], s[86:87] -; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_26 +; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_25 ; GLOBALNESS1-NEXT: ; %bb.11: ; %bb33.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: global_load_dwordx2 v[0:1], v[44:45], off @@ -210,7 +212,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: .LBB1_15: ; %bb63.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[68:69] -; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_25 +; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_24 ; GLOBALNESS1-NEXT: .LBB1_16: ; %bb44.i ; GLOBALNESS1-NEXT: ; Parent Loop BB1_4 Depth=1 ; GLOBALNESS1-NEXT: ; => This Inner Loop Header: Depth=2 @@ -271,11 +273,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[42:43], off ; GLOBALNESS1-NEXT: s_branch .LBB1_14 -; GLOBALNESS1-NEXT: .LBB1_24: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: ; implicit-def: $vgpr58_vgpr59 -; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_4 -; GLOBALNESS1-NEXT: s_branch .LBB1_29 -; GLOBALNESS1-NEXT: .LBB1_25: ; %Flow23 +; GLOBALNESS1-NEXT: .LBB1_24: ; %Flow23 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: s_load_dwordx4 s[4:7], s[38:39], 0x0 ; GLOBALNESS1-NEXT: v_readlane_b32 s70, v57, 8 @@ -285,21 +283,25 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS1-NEXT: s_mov_b32 s55, s7 ; GLOBALNESS1-NEXT: v_readlane_b32 s9, v57, 11 -; GLOBALNESS1-NEXT: .LBB1_26: ; %Flow24 +; GLOBALNESS1-NEXT: .LBB1_25: ; %Flow24 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[52:53] ; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[86:87] ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_2 -; GLOBALNESS1-NEXT: ; %bb.27: ; %bb67.i +; GLOBALNESS1-NEXT: ; %bb.26: ; %bb67.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: v_readlane_b32 s6, v57, 4 ; GLOBALNESS1-NEXT: v_readlane_b32 s7, v57, 5 ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_1 -; GLOBALNESS1-NEXT: ; %bb.28: ; %bb69.i +; GLOBALNESS1-NEXT: ; %bb.27: ; %bb69.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[42:43], off ; GLOBALNESS1-NEXT: s_branch .LBB1_1 +; GLOBALNESS1-NEXT: .LBB1_28: ; %bb73.i +; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[42:43], off +; GLOBALNESS1-NEXT: s_branch .LBB1_2 ; GLOBALNESS1-NEXT: .LBB1_29: ; %loop.exit.guard ; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[8:9] ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], -1 @@ -422,16 +424,16 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: v_readlane_b32 s6, v57, 6 ; GLOBALNESS0-NEXT: v_readlane_b32 s7, v57, 7 ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7] -; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_3 +; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_28 ; GLOBALNESS0-NEXT: .LBB1_2: ; %Flow15 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[4:5] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[58:59], v[0:1], v[0:1] op_sel:[0,1] -; GLOBALNESS0-NEXT: s_branch .LBB1_4 -; GLOBALNESS0-NEXT: .LBB1_3: ; %bb73.i +; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], 0 +; GLOBALNESS0-NEXT: .LBB1_3: ; %Flow28 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[42:43], off -; GLOBALNESS0-NEXT: s_branch .LBB1_2 +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[58:59], v[0:1], v[0:1] op_sel:[0,1] +; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_29 ; GLOBALNESS0-NEXT: .LBB1_4: ; %bb5 ; GLOBALNESS0-NEXT: ; =>This Loop Header: Depth=1 ; GLOBALNESS0-NEXT: ; Child Loop BB1_16 Depth 2 @@ -480,8 +482,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GLOBALNESS0-NEXT: .LBB1_9: ; %Flow25 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], -1 ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[4:5] -; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_24 +; GLOBALNESS0-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_3 ; GLOBALNESS0-NEXT: ; %bb.10: ; %baz.exit.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: flat_load_dword v0, v[44:45] @@ -490,7 +494,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v1, 0x3ff00000 ; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[52:53], s[86:87] -; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_26 +; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_25 ; GLOBALNESS0-NEXT: ; %bb.11: ; %bb33.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[44:45], off @@ -520,7 +524,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: .LBB1_15: ; %bb63.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[68:69] -; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_25 +; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_24 ; GLOBALNESS0-NEXT: .LBB1_16: ; %bb44.i ; GLOBALNESS0-NEXT: ; Parent Loop BB1_4 Depth=1 ; GLOBALNESS0-NEXT: ; => This Inner Loop Header: Depth=2 @@ -581,11 +585,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[42:43], off ; GLOBALNESS0-NEXT: s_branch .LBB1_14 -; GLOBALNESS0-NEXT: .LBB1_24: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: ; implicit-def: $vgpr58_vgpr59 -; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_4 -; GLOBALNESS0-NEXT: s_branch .LBB1_29 -; GLOBALNESS0-NEXT: .LBB1_25: ; %Flow23 +; GLOBALNESS0-NEXT: .LBB1_24: ; %Flow23 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: v_readlane_b32 s84, v57, 8 ; GLOBALNESS0-NEXT: v_readlane_b32 s8, v57, 10 @@ -593,21 +593,25 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_mov_b32 s55, s83 ; GLOBALNESS0-NEXT: v_readlane_b32 s85, v57, 9 ; GLOBALNESS0-NEXT: v_readlane_b32 s9, v57, 11 -; GLOBALNESS0-NEXT: .LBB1_26: ; %Flow24 +; GLOBALNESS0-NEXT: .LBB1_25: ; %Flow24 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[52:53] ; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[86:87] ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_2 -; GLOBALNESS0-NEXT: ; %bb.27: ; %bb67.i +; GLOBALNESS0-NEXT: ; %bb.26: ; %bb67.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: v_readlane_b32 s6, v57, 4 ; GLOBALNESS0-NEXT: v_readlane_b32 s7, v57, 5 ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_1 -; GLOBALNESS0-NEXT: ; %bb.28: ; %bb69.i +; GLOBALNESS0-NEXT: ; %bb.27: ; %bb69.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[42:43], off ; GLOBALNESS0-NEXT: s_branch .LBB1_1 +; GLOBALNESS0-NEXT: .LBB1_28: ; %bb73.i +; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[42:43], off +; GLOBALNESS0-NEXT: s_branch .LBB1_2 ; GLOBALNESS0-NEXT: .LBB1_29: ; %loop.exit.guard ; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[8:9] ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], -1