From 7d219400516d9f22f5fb833bb4f102c7b8af47a3 Mon Sep 17 00:00:00 2001 From: Christudasan Devadasan Date: Thu, 14 Nov 2024 14:11:44 +0530 Subject: [PATCH 1/2] [CodeGen] Introduce MI flag for Live Range split instructions For some targets, it is required to identify the COPY instruction corresponds to the RA inserted live range split. Adding the new flag `MachineInstr::LRSplit` to serve the purpose. --- llvm/include/llvm/CodeGen/MachineInstr.h | 3 ++- llvm/lib/CodeGen/SplitKit.cpp | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/llvm/include/llvm/CodeGen/MachineInstr.h b/llvm/include/llvm/CodeGen/MachineInstr.h index 8e2574974a82d..b52205322819a 100644 --- a/llvm/include/llvm/CodeGen/MachineInstr.h +++ b/llvm/include/llvm/CodeGen/MachineInstr.h @@ -123,8 +123,9 @@ class MachineInstr NoUSWrap = 1 << 20, // Instruction supports geps // no unsigned signed wrap. SameSign = 1 << 21, // Both operands have the same sign. - InBounds = 1 << 22 // Pointer arithmetic remains inbounds. + InBounds = 1 << 22, // Pointer arithmetic remains inbounds. // Implies NoUSWrap. + LRSplit = 1 << 23 // Instruction for live range split. }; private: diff --git a/llvm/lib/CodeGen/SplitKit.cpp b/llvm/lib/CodeGen/SplitKit.cpp index 8ec4bfbb5a330..e7fdceb7f4923 100644 --- a/llvm/lib/CodeGen/SplitKit.cpp +++ b/llvm/lib/CodeGen/SplitKit.cpp @@ -531,6 +531,7 @@ SlotIndex SplitEditor::buildSingleSubRegCopy( | getInternalReadRegState(!FirstCopy), SubIdx) .addReg(FromReg, 0, SubIdx); + CopyMI->setFlag(MachineInstr::LRSplit); SlotIndexes &Indexes = *LIS.getSlotIndexes(); if (FirstCopy) { Def = Indexes.insertMachineInstrInMaps(*CopyMI, Late).getRegSlot(); @@ -550,6 +551,7 @@ SlotIndex SplitEditor::buildCopy(Register FromReg, Register ToReg, // The full vreg is copied. MachineInstr *CopyMI = BuildMI(MBB, InsertBefore, DebugLoc(), Desc, ToReg).addReg(FromReg); + CopyMI->setFlag(MachineInstr::LRSplit); return Indexes.insertMachineInstrInMaps(*CopyMI, Late).getRegSlot(); } From 4f9a9eb125e528d1ccfb2e1b9567ce54a637f6b7 Mon Sep 17 00:00:00 2001 From: Christudasan Devadasan Date: Tue, 19 Nov 2024 12:14:05 +0530 Subject: [PATCH 2/2] [AMDGPU] Add liverange split instructions into BB Prolog The COPY inserted for liverange split during sgpr-regalloc pipeline currently breaks the BB prolog during the subsequent vgpr-regalloc phase while spilling and/or splitting the vector liveranges. This patch fixes it by correctly including the the LR split instructions during sgpr-regalloc and wwm-regalloc pipelines into the BB prolog. --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 34 +- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 2 + .../CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll | 1268 +++++++++-------- .../ran-out-of-sgprs-allocation-failure.mir | 120 +- 4 files changed, 734 insertions(+), 690 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 87278f74cea8d..f1db55a2a122b 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -9878,6 +9878,30 @@ unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg, return AMDGPU::COPY; } +bool SIInstrInfo::canAddToBBProlog(const MachineInstr &MI) const { + uint16_t Opcode = MI.getOpcode(); + // Check if it is SGPR spill or wwm-register spill Opcode. + if (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode)) + return true; + + const MachineFunction *MF = MI.getMF(); + const MachineRegisterInfo &MRI = MF->getRegInfo(); + const SIMachineFunctionInfo *MFI = MF->getInfo(); + + // See if this is Liverange split instruction inserted for SGPR or + // wwm-register. The implicit def inserted for wwm-registers should also be + // included as they can appear at the bb begin. + bool IsLRSplitInst = MI.getFlag(MachineInstr::LRSplit); + if (!IsLRSplitInst && Opcode != AMDGPU::IMPLICIT_DEF) + return false; + + Register Reg = MI.getOperand(0).getReg(); + if (RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg))) + return IsLRSplitInst; + + return MFI->isWWMReg(Reg); +} + bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI, Register Reg) const { // We need to handle instructions which may be inserted during register @@ -9886,20 +9910,16 @@ bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI, // needed by the prolog. However, the insertions for scalar registers can // always be placed at the BB top as they are independent of the exec mask // value. - const MachineFunction *MF = MI.getMF(); bool IsNullOrVectorRegister = true; if (Reg) { + const MachineFunction *MF = MI.getMF(); const MachineRegisterInfo &MRI = MF->getRegInfo(); IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg)); } - uint16_t Opcode = MI.getOpcode(); - const SIMachineFunctionInfo *MFI = MF->getInfo(); return IsNullOrVectorRegister && - (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode) || - (Opcode == AMDGPU::IMPLICIT_DEF && - MFI->isWWMReg(MI.getOperand(0).getReg())) || - (!MI.isTerminator() && Opcode != AMDGPU::COPY && + (canAddToBBProlog(MI) || + (!MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY && MI.modifiesRegister(AMDGPU::EXEC, &RI))); } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 9373cdb199e29..d68a88f037fd8 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1579,6 +1579,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { bool isBasicBlockPrologue(const MachineInstr &MI, Register Reg = Register()) const override; + bool canAddToBBProlog(const MachineInstr &MI) const; + MachineInstr *createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index 4c5c56a49fdc6..2fef934fa472e 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -177734,13 +177734,18 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332 ; SI-NEXT: ; implicit-def: $vgpr61 : SGPR spill to VGPR lane -; SI-NEXT: s_mov_b32 s10, s16 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_writelane_b32 v63, s30, 0 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_writelane_b32 v61, s29, 0 ; SI-NEXT: v_writelane_b32 v61, s28, 1 ; SI-NEXT: v_writelane_b32 v61, s27, 2 -; SI-NEXT: s_mov_b32 s61, s21 -; SI-NEXT: v_writelane_b32 v63, s30, 0 +; SI-NEXT: v_writelane_b32 v61, s26, 3 +; SI-NEXT: v_writelane_b32 v61, s25, 4 +; SI-NEXT: v_writelane_b32 v61, s24, 5 +; SI-NEXT: v_writelane_b32 v61, s23, 6 +; SI-NEXT: v_writelane_b32 v61, s22, 7 +; SI-NEXT: v_writelane_b32 v61, s21, 8 ; SI-NEXT: v_writelane_b32 v63, s31, 1 ; SI-NEXT: v_writelane_b32 v63, s34, 2 ; SI-NEXT: v_writelane_b32 v63, s35, 3 @@ -177774,59 +177779,52 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: v_writelane_b32 v63, s87, 31 ; SI-NEXT: v_writelane_b32 v63, s96, 32 ; SI-NEXT: v_writelane_b32 v63, s97, 33 -; SI-NEXT: s_mov_b32 s67, s19 -; SI-NEXT: s_mov_b32 s54, s17 -; SI-NEXT: s_mov_b32 s35, s23 -; SI-NEXT: s_mov_b32 s39, s26 -; SI-NEXT: s_mov_b32 s62, s25 +; SI-NEXT: s_mov_b32 s72, s19 +; SI-NEXT: s_mov_b32 s73, s17 +; SI-NEXT: s_mov_b32 s60, s20 ; SI-NEXT: v_writelane_b32 v63, s98, 34 -; SI-NEXT: v_writelane_b32 v63, s99, 35 -; SI-NEXT: v_readfirstlane_b32 s99, v1 -; SI-NEXT: v_readfirstlane_b32 s74, v24 +; SI-NEXT: v_readfirstlane_b32 s31, v1 +; SI-NEXT: v_readfirstlane_b32 s12, v28 ; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane -; SI-NEXT: v_readfirstlane_b32 s6, v23 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v62, s74, 0 -; SI-NEXT: v_readfirstlane_b32 s12, v26 -; SI-NEXT: v_writelane_b32 v62, s6, 1 -; SI-NEXT: v_readfirstlane_b32 s14, v25 -; SI-NEXT: v_writelane_b32 v62, s12, 2 -; SI-NEXT: v_readfirstlane_b32 s46, v28 -; SI-NEXT: v_writelane_b32 v62, s14, 3 -; SI-NEXT: v_readfirstlane_b32 s56, v27 -; SI-NEXT: v_writelane_b32 v62, s46, 4 -; SI-NEXT: v_readfirstlane_b32 s57, v30 -; SI-NEXT: v_writelane_b32 v62, s56, 5 -; SI-NEXT: v_readfirstlane_b32 s59, v29 -; SI-NEXT: v_writelane_b32 v62, s57, 6 -; SI-NEXT: v_writelane_b32 v62, s59, 7 -; SI-NEXT: s_mov_b32 s60, s20 -; SI-NEXT: s_mov_b32 s63, s24 -; SI-NEXT: v_readfirstlane_b32 s95, v3 -; SI-NEXT: v_readfirstlane_b32 s31, v5 -; SI-NEXT: v_readfirstlane_b32 s24, v9 -; SI-NEXT: v_readfirstlane_b32 s38, v12 -; SI-NEXT: v_readfirstlane_b32 s36, v11 -; SI-NEXT: v_readfirstlane_b32 s8, v14 -; SI-NEXT: v_readfirstlane_b32 s27, v13 -; SI-NEXT: v_readfirstlane_b32 s9, v16 -; SI-NEXT: v_readfirstlane_b32 s79, v15 -; SI-NEXT: v_readfirstlane_b32 s13, v18 -; SI-NEXT: v_readfirstlane_b32 s15, v17 -; SI-NEXT: v_readfirstlane_b32 s42, v20 -; SI-NEXT: v_readfirstlane_b32 s43, v19 -; SI-NEXT: v_readfirstlane_b32 s44, v22 +; SI-NEXT: v_readfirstlane_b32 s41, v27 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v62, s12, 0 +; SI-NEXT: v_readfirstlane_b32 s46, v30 +; SI-NEXT: v_writelane_b32 v62, s41, 1 +; SI-NEXT: v_readfirstlane_b32 s56, v29 +; SI-NEXT: v_writelane_b32 v62, s46, 2 +; SI-NEXT: v_writelane_b32 v62, s56, 3 +; SI-NEXT: s_mov_b32 s10, s16 +; SI-NEXT: v_readfirstlane_b32 s36, v3 +; SI-NEXT: v_writelane_b32 v63, s99, 35 +; SI-NEXT: v_readfirstlane_b32 s99, v6 +; SI-NEXT: v_readfirstlane_b32 s94, v5 +; SI-NEXT: v_readfirstlane_b32 s38, v7 +; SI-NEXT: v_readfirstlane_b32 s91, v10 +; SI-NEXT: v_readfirstlane_b32 s88, v9 +; SI-NEXT: v_readfirstlane_b32 s90, v12 +; SI-NEXT: v_readfirstlane_b32 s16, v11 +; SI-NEXT: v_readfirstlane_b32 s24, v14 +; SI-NEXT: v_readfirstlane_b32 s8, v13 +; SI-NEXT: v_readfirstlane_b32 s27, v16 +; SI-NEXT: v_readfirstlane_b32 s9, v15 +; SI-NEXT: v_readfirstlane_b32 s79, v18 +; SI-NEXT: v_readfirstlane_b32 s13, v17 +; SI-NEXT: v_readfirstlane_b32 s40, v20 +; SI-NEXT: v_readfirstlane_b32 s42, v19 +; SI-NEXT: v_readfirstlane_b32 s43, v22 +; SI-NEXT: v_readfirstlane_b32 s44, v21 +; SI-NEXT: v_readfirstlane_b32 s78, v24 +; SI-NEXT: v_readfirstlane_b32 s37, v23 +; SI-NEXT: v_readfirstlane_b32 s28, v26 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:328 -; SI-NEXT: v_writelane_b32 v61, s4, 3 -; SI-NEXT: v_readfirstlane_b32 s45, v21 -; SI-NEXT: v_readfirstlane_b32 s98, v10 -; SI-NEXT: v_readfirstlane_b32 s90, v8 -; SI-NEXT: v_readfirstlane_b32 s88, v7 -; SI-NEXT: v_readfirstlane_b32 s91, v6 -; SI-NEXT: v_readfirstlane_b32 s93, v4 -; SI-NEXT: v_readfirstlane_b32 s55, v2 +; SI-NEXT: v_writelane_b32 v61, s4, 9 +; SI-NEXT: v_readfirstlane_b32 s7, v25 +; SI-NEXT: v_readfirstlane_b32 s95, v8 +; SI-NEXT: v_readfirstlane_b32 s96, v4 +; SI-NEXT: v_readfirstlane_b32 s6, v2 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill @@ -177844,375 +177842,375 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:324 -; SI-NEXT: v_writelane_b32 v61, s4, 4 +; SI-NEXT: v_writelane_b32 v61, s4, 10 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:320 -; SI-NEXT: v_writelane_b32 v61, s4, 5 +; SI-NEXT: v_writelane_b32 v61, s4, 11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:316 -; SI-NEXT: v_writelane_b32 v61, s4, 6 +; SI-NEXT: v_writelane_b32 v61, s4, 12 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:312 -; SI-NEXT: v_writelane_b32 v61, s4, 7 +; SI-NEXT: v_writelane_b32 v61, s4, 13 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:308 -; SI-NEXT: v_writelane_b32 v61, s4, 8 +; SI-NEXT: v_writelane_b32 v61, s4, 14 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:304 -; SI-NEXT: v_writelane_b32 v61, s4, 9 +; SI-NEXT: v_writelane_b32 v61, s4, 15 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300 -; SI-NEXT: v_writelane_b32 v61, s4, 10 +; SI-NEXT: v_writelane_b32 v61, s4, 16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:296 -; SI-NEXT: v_writelane_b32 v61, s4, 11 +; SI-NEXT: v_writelane_b32 v61, s4, 17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:292 -; SI-NEXT: v_writelane_b32 v61, s4, 12 +; SI-NEXT: v_writelane_b32 v61, s4, 18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:288 -; SI-NEXT: v_writelane_b32 v61, s4, 13 +; SI-NEXT: v_writelane_b32 v61, s4, 19 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:284 -; SI-NEXT: v_writelane_b32 v61, s4, 14 +; SI-NEXT: v_writelane_b32 v61, s4, 20 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:280 -; SI-NEXT: v_writelane_b32 v61, s4, 15 +; SI-NEXT: v_writelane_b32 v61, s4, 21 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:276 -; SI-NEXT: v_writelane_b32 v61, s4, 16 +; SI-NEXT: v_writelane_b32 v61, s4, 22 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:272 -; SI-NEXT: v_writelane_b32 v61, s4, 17 +; SI-NEXT: v_writelane_b32 v61, s4, 23 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:268 -; SI-NEXT: v_writelane_b32 v61, s4, 18 +; SI-NEXT: v_writelane_b32 v61, s4, 24 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:264 -; SI-NEXT: v_writelane_b32 v61, s4, 19 +; SI-NEXT: v_writelane_b32 v61, s4, 25 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:260 -; SI-NEXT: v_writelane_b32 v61, s4, 20 +; SI-NEXT: v_writelane_b32 v61, s4, 26 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:256 -; SI-NEXT: v_writelane_b32 v61, s4, 21 +; SI-NEXT: v_writelane_b32 v61, s4, 27 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:252 -; SI-NEXT: v_writelane_b32 v61, s4, 22 +; SI-NEXT: v_writelane_b32 v61, s4, 28 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:248 -; SI-NEXT: v_writelane_b32 v61, s4, 23 +; SI-NEXT: v_writelane_b32 v61, s4, 29 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:244 -; SI-NEXT: v_writelane_b32 v61, s4, 24 +; SI-NEXT: v_writelane_b32 v61, s4, 30 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:240 -; SI-NEXT: v_writelane_b32 v61, s4, 25 +; SI-NEXT: v_writelane_b32 v61, s4, 31 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:236 -; SI-NEXT: v_writelane_b32 v61, s4, 26 +; SI-NEXT: v_writelane_b32 v61, s4, 32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:232 -; SI-NEXT: v_writelane_b32 v61, s4, 27 +; SI-NEXT: v_writelane_b32 v61, s4, 33 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:228 -; SI-NEXT: v_writelane_b32 v61, s4, 28 +; SI-NEXT: v_writelane_b32 v61, s4, 34 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:224 -; SI-NEXT: v_writelane_b32 v61, s4, 29 +; SI-NEXT: v_writelane_b32 v61, s4, 35 +; SI-NEXT: v_writelane_b32 v61, s73, 36 +; SI-NEXT: v_writelane_b32 v61, s10, 37 +; SI-NEXT: v_writelane_b32 v61, s72, 38 +; SI-NEXT: v_writelane_b32 v61, s18, 39 +; SI-NEXT: v_writelane_b32 v61, s60, 40 +; SI-NEXT: v_writelane_b32 v61, s31, 41 +; SI-NEXT: v_writelane_b32 v61, s36, 42 +; SI-NEXT: v_writelane_b32 v61, s99, 43 +; SI-NEXT: v_writelane_b32 v61, s94, 44 +; SI-NEXT: v_writelane_b32 v61, s38, 45 +; SI-NEXT: v_writelane_b32 v61, s91, 46 +; SI-NEXT: v_writelane_b32 v61, s88, 47 +; SI-NEXT: v_writelane_b32 v61, s90, 48 +; SI-NEXT: v_writelane_b32 v61, s16, 49 +; SI-NEXT: v_writelane_b32 v61, s24, 50 +; SI-NEXT: v_writelane_b32 v61, s8, 51 +; SI-NEXT: v_writelane_b32 v61, s27, 52 +; SI-NEXT: v_writelane_b32 v61, s9, 53 +; SI-NEXT: v_writelane_b32 v61, s79, 54 +; SI-NEXT: v_writelane_b32 v61, s13, 55 +; SI-NEXT: v_writelane_b32 v61, s40, 56 +; SI-NEXT: v_writelane_b32 v61, s42, 57 +; SI-NEXT: v_writelane_b32 v61, s43, 58 +; SI-NEXT: v_writelane_b32 v61, s44, 59 +; SI-NEXT: v_writelane_b32 v61, s78, 60 +; SI-NEXT: v_writelane_b32 v61, s37, 61 +; SI-NEXT: v_writelane_b32 v61, s28, 62 +; SI-NEXT: v_writelane_b32 v61, s7, 63 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s93, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:220 -; SI-NEXT: v_writelane_b32 v61, s4, 30 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s68, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:216 -; SI-NEXT: v_writelane_b32 v61, s4, 31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s89, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 -; SI-NEXT: v_writelane_b32 v61, s4, 32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s16, v31 +; SI-NEXT: v_readfirstlane_b32 s30, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:208 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s34, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:204 -; SI-NEXT: v_writelane_b32 v61, s4, 33 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s89, v31 +; SI-NEXT: v_readfirstlane_b32 s39, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:200 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s53, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:196 -; SI-NEXT: v_writelane_b32 v61, s4, 34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s73, v31 +; SI-NEXT: v_readfirstlane_b32 s45, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:192 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s81, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:188 -; SI-NEXT: v_writelane_b32 v61, s4, 35 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s72, v31 +; SI-NEXT: v_readfirstlane_b32 s66, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:184 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s40, v31 +; SI-NEXT: v_readfirstlane_b32 s21, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s21, v31 +; SI-NEXT: v_readfirstlane_b32 s69, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:176 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s85, v31 +; SI-NEXT: v_readfirstlane_b32 s97, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:172 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s81, v31 +; SI-NEXT: v_readfirstlane_b32 s25, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:168 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s97, v31 +; SI-NEXT: v_readfirstlane_b32 s85, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:164 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s7, v31 +; SI-NEXT: v_readfirstlane_b32 s29, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:160 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s11, v31 +; SI-NEXT: v_readfirstlane_b32 s14, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s41, v31 +; SI-NEXT: v_readfirstlane_b32 s11, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:152 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s47, v31 +; SI-NEXT: v_readfirstlane_b32 s57, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s58, v31 +; SI-NEXT: v_readfirstlane_b32 s47, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:144 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s76, v31 +; SI-NEXT: v_readfirstlane_b32 s75, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:140 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s29, v31 +; SI-NEXT: v_readfirstlane_b32 s59, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s50, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 -; SI-NEXT: v_writelane_b32 v61, s4, 36 -; SI-NEXT: v_writelane_b32 v61, s54, 37 -; SI-NEXT: v_writelane_b32 v61, s10, 38 -; SI-NEXT: v_writelane_b32 v61, s67, 39 -; SI-NEXT: v_writelane_b32 v61, s18, 40 -; SI-NEXT: v_writelane_b32 v61, s61, 41 -; SI-NEXT: v_writelane_b32 v61, s60, 42 -; SI-NEXT: v_writelane_b32 v61, s35, 43 -; SI-NEXT: v_writelane_b32 v61, s22, 44 -; SI-NEXT: v_writelane_b32 v61, s62, 45 -; SI-NEXT: v_writelane_b32 v61, s63, 46 -; SI-NEXT: v_writelane_b32 v61, s39, 47 -; SI-NEXT: v_writelane_b32 v61, s99, 48 -; SI-NEXT: v_writelane_b32 v61, s95, 49 -; SI-NEXT: v_writelane_b32 v61, s31, 50 -; SI-NEXT: v_writelane_b32 v61, s24, 51 -; SI-NEXT: v_writelane_b32 v61, s38, 52 -; SI-NEXT: v_writelane_b32 v61, s36, 53 -; SI-NEXT: v_writelane_b32 v61, s8, 54 -; SI-NEXT: v_writelane_b32 v61, s27, 55 -; SI-NEXT: v_writelane_b32 v61, s9, 56 -; SI-NEXT: v_writelane_b32 v61, s79, 57 -; SI-NEXT: v_writelane_b32 v61, s13, 58 -; SI-NEXT: v_writelane_b32 v61, s15, 59 -; SI-NEXT: v_writelane_b32 v61, s42, 60 -; SI-NEXT: v_writelane_b32 v61, s43, 61 -; SI-NEXT: v_writelane_b32 v61, s44, 62 -; SI-NEXT: v_writelane_b32 v61, s45, 63 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s37, v31 +; SI-NEXT: v_readfirstlane_b32 s76, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s50, v31 +; SI-NEXT: v_readfirstlane_b32 s54, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s48, v31 +; SI-NEXT: v_readfirstlane_b32 s35, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s19, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s64, v31 +; SI-NEXT: v_readfirstlane_b32 s48, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s17, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s65, v31 +; SI-NEXT: v_readfirstlane_b32 s52, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s71, v31 +; SI-NEXT: v_readfirstlane_b32 s49, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s70, v31 +; SI-NEXT: v_readfirstlane_b32 s65, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s83, v31 +; SI-NEXT: v_readfirstlane_b32 s67, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s49, v31 +; SI-NEXT: v_readfirstlane_b32 s64, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s80, v31 +; SI-NEXT: v_readfirstlane_b32 s70, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s82, v31 +; SI-NEXT: v_readfirstlane_b32 s71, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s87, v31 +; SI-NEXT: v_readfirstlane_b32 s83, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s84, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s51, v31 +; SI-NEXT: v_readfirstlane_b32 s55, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s86, v31 +; SI-NEXT: v_readfirstlane_b32 s80, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s94, v31 +; SI-NEXT: v_readfirstlane_b32 s62, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s96, v31 +; SI-NEXT: v_readfirstlane_b32 s82, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s68, v31 +; SI-NEXT: v_readfirstlane_b32 s63, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s34, v31 +; SI-NEXT: v_readfirstlane_b32 s87, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s77, v31 +; SI-NEXT: v_readfirstlane_b32 s61, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s66, v31 +; SI-NEXT: v_readfirstlane_b32 s86, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s78, v31 +; SI-NEXT: v_readfirstlane_b32 s22, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s53, v31 +; SI-NEXT: v_readfirstlane_b32 s51, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s69, v31 +; SI-NEXT: v_readfirstlane_b32 s74, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s30, v31 +; SI-NEXT: v_readfirstlane_b32 s20, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s52, v31 +; SI-NEXT: v_readfirstlane_b32 s26, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s75, v31 +; SI-NEXT: v_readfirstlane_b32 s92, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s23, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s28, v31 +; SI-NEXT: v_readfirstlane_b32 s98, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s26, v31 +; SI-NEXT: v_readfirstlane_b32 s15, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s25, v31 +; SI-NEXT: v_readfirstlane_b32 s58, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: v_writelane_b32 v62, s25, 8 -; SI-NEXT: v_writelane_b32 v62, s28, 9 +; SI-NEXT: v_writelane_b32 v62, s58, 4 +; SI-NEXT: v_writelane_b32 v62, s98, 5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s92, v31 -; SI-NEXT: v_writelane_b32 v62, s92, 10 -; SI-NEXT: v_writelane_b32 v62, s75, 11 -; SI-NEXT: v_writelane_b32 v62, s26, 12 -; SI-NEXT: v_writelane_b32 v62, s30, 13 -; SI-NEXT: v_writelane_b32 v62, s23, 14 -; SI-NEXT: v_writelane_b32 v62, s52, 15 +; SI-NEXT: v_readfirstlane_b32 s77, v31 +; SI-NEXT: v_writelane_b32 v62, s77, 6 +; SI-NEXT: v_writelane_b32 v62, s92, 7 +; SI-NEXT: v_writelane_b32 v62, s15, 8 +; SI-NEXT: v_writelane_b32 v62, s20, 9 +; SI-NEXT: v_writelane_b32 v62, s23, 10 +; SI-NEXT: v_writelane_b32 v62, s26, 11 +; SI-NEXT: v_writelane_b32 v62, s48, 12 +; SI-NEXT: v_writelane_b32 v62, s17, 13 +; SI-NEXT: v_writelane_b32 v62, s52, 14 +; SI-NEXT: v_writelane_b32 v62, s65, 15 ; SI-NEXT: v_writelane_b32 v62, s64, 16 -; SI-NEXT: v_writelane_b32 v62, s17, 17 -; SI-NEXT: v_writelane_b32 v62, s65, 18 -; SI-NEXT: v_writelane_b32 v62, s70, 19 -; SI-NEXT: v_writelane_b32 v62, s71, 20 -; SI-NEXT: v_writelane_b32 v62, s49, 21 -; SI-NEXT: v_writelane_b32 v62, s83, 22 -; SI-NEXT: v_writelane_b32 v62, s80, 23 +; SI-NEXT: v_writelane_b32 v62, s49, 17 +; SI-NEXT: v_writelane_b32 v62, s67, 18 +; SI-NEXT: v_writelane_b32 v62, s71, 19 +; SI-NEXT: v_writelane_b32 v62, s70, 20 +; SI-NEXT: v_writelane_b32 v62, s84, 21 +; SI-NEXT: v_writelane_b32 v62, s80, 22 +; SI-NEXT: v_writelane_b32 v62, s83, 23 ; SI-NEXT: v_writelane_b32 v62, s82, 24 -; SI-NEXT: v_writelane_b32 v62, s84, 25 -; SI-NEXT: v_writelane_b32 v62, s87, 26 +; SI-NEXT: v_writelane_b32 v62, s87, 25 +; SI-NEXT: v_writelane_b32 v62, s51, 26 ; SI-NEXT: v_writelane_b32 v62, s86, 27 -; SI-NEXT: v_writelane_b32 v62, s51, 28 -; SI-NEXT: v_writelane_b32 v62, s96, 29 -; SI-NEXT: v_writelane_b32 v62, s34, 30 -; SI-NEXT: v_writelane_b32 v62, s94, 31 -; SI-NEXT: v_writelane_b32 v62, s53, 32 -; SI-NEXT: v_writelane_b32 v62, s66, 33 -; SI-NEXT: v_writelane_b32 v62, s68, 34 -; SI-NEXT: v_writelane_b32 v62, s69, 35 -; SI-NEXT: v_writelane_b32 v62, s77, 36 -; SI-NEXT: v_writelane_b32 v62, s78, 37 +; SI-NEXT: v_writelane_b32 v62, s55, 28 +; SI-NEXT: v_writelane_b32 v62, s62, 29 +; SI-NEXT: v_writelane_b32 v62, s63, 30 +; SI-NEXT: v_writelane_b32 v62, s74, 31 +; SI-NEXT: v_writelane_b32 v62, s61, 32 +; SI-NEXT: v_writelane_b32 v62, s22, 33 ; SI-NEXT: s_cbranch_scc0 .LBB93_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s10, 0xff -; SI-NEXT: s_lshl_b32 s5, s54, 8 +; SI-NEXT: s_lshl_b32 s5, s73, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: s_and_b32 s4, s18, 0xff -; SI-NEXT: s_lshl_b32 s5, s67, 8 +; SI-NEXT: s_lshl_b32 s5, s72, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s5, v61, 8 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 ; SI-NEXT: s_and_b32 s4, s60, 0xff -; SI-NEXT: s_lshl_b32 s5, s61, 8 +; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_and_b32 s4, s22, 0xff -; SI-NEXT: s_lshl_b32 s5, s35, 8 +; SI-NEXT: v_readlane_b32 s4, v61, 7 +; SI-NEXT: v_readlane_b32 s5, v61, 6 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_and_b32 s4, s63, 0xff -; SI-NEXT: s_lshl_b32 s5, s62, 8 +; SI-NEXT: v_readlane_b32 s4, v61, 5 +; SI-NEXT: v_readlane_b32 s5, v61, 4 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 3 ; SI-NEXT: v_readlane_b32 s5, v61, 2 -; SI-NEXT: s_and_b32 s4, s39, 0xff +; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 1 @@ -178221,466 +178219,490 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_and_b32 s4, s99, 0xff -; SI-NEXT: s_lshl_b32 s5, s55, 8 +; SI-NEXT: s_and_b32 s4, s31, 0xff +; SI-NEXT: s_lshl_b32 s5, s6, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 -; SI-NEXT: s_and_b32 s4, s95, 0xff -; SI-NEXT: s_lshl_b32 s5, s93, 8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_and_b32 s4, s36, 0xff +; SI-NEXT: s_lshl_b32 s5, s96, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_and_b32 s4, s31, 0xff -; SI-NEXT: s_lshl_b32 s5, s91, 8 +; SI-NEXT: s_and_b32 s4, s94, 0xff +; SI-NEXT: s_lshl_b32 s5, s99, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_and_b32 s4, s88, 0xff -; SI-NEXT: s_lshl_b32 s5, s90, 8 +; SI-NEXT: s_and_b32 s4, s38, 0xff +; SI-NEXT: s_lshl_b32 s5, s95, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_and_b32 s4, s24, 0xff -; SI-NEXT: s_lshl_b32 s5, s98, 8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: s_and_b32 s4, s88, 0xff +; SI-NEXT: s_lshl_b32 s5, s91, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_and_b32 s4, s36, 0xff -; SI-NEXT: s_lshl_b32 s5, s38, 8 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s90, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: s_and_b32 s4, s27, 0xff -; SI-NEXT: s_lshl_b32 s5, s8, 8 +; SI-NEXT: s_and_b32 s4, s8, 0xff +; SI-NEXT: s_lshl_b32 s5, s24, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 -; SI-NEXT: s_and_b32 s4, s79, 0xff -; SI-NEXT: s_lshl_b32 s5, s9, 8 +; SI-NEXT: s_and_b32 s4, s9, 0xff +; SI-NEXT: s_lshl_b32 s5, s27, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_and_b32 s4, s15, 0xff -; SI-NEXT: s_lshl_b32 s5, s13, 8 +; SI-NEXT: s_and_b32 s4, s13, 0xff +; SI-NEXT: s_lshl_b32 s5, s79, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 -; SI-NEXT: s_and_b32 s4, s43, 0xff -; SI-NEXT: s_lshl_b32 s5, s42, 8 +; SI-NEXT: s_and_b32 s4, s42, 0xff +; SI-NEXT: s_lshl_b32 s5, s40, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_and_b32 s4, s45, 0xff -; SI-NEXT: s_lshl_b32 s5, s44, 8 +; SI-NEXT: s_and_b32 s4, s44, 0xff +; SI-NEXT: s_lshl_b32 s5, s43, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_and_b32 s4, s6, 0xff -; SI-NEXT: s_lshl_b32 s5, s74, 8 +; SI-NEXT: s_and_b32 s4, s37, 0xff +; SI-NEXT: s_lshl_b32 s5, s78, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 -; SI-NEXT: s_and_b32 s4, s14, 0xff -; SI-NEXT: s_lshl_b32 s5, s12, 8 +; SI-NEXT: s_and_b32 s4, s7, 0xff +; SI-NEXT: s_lshl_b32 s5, s28, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 -; SI-NEXT: s_and_b32 s4, s56, 0xff -; SI-NEXT: s_lshl_b32 s5, s46, 8 +; SI-NEXT: s_and_b32 s4, s41, 0xff +; SI-NEXT: s_lshl_b32 s5, s12, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_and_b32 s4, s59, 0xff -; SI-NEXT: s_lshl_b32 s5, s57, 8 +; SI-NEXT: s_and_b32 s4, s56, 0xff +; SI-NEXT: s_lshl_b32 s5, s46, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_and_b32 s4, s92, 0xff -; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_and_b32 s4, s77, 0xff +; SI-NEXT: s_lshl_b32 s5, s58, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_and_b32 s4, s26, 0xff -; SI-NEXT: s_lshl_b32 s5, s28, 8 +; SI-NEXT: s_and_b32 s4, s15, 0xff +; SI-NEXT: s_lshl_b32 s5, s98, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 ; SI-NEXT: s_and_b32 s4, s23, 0xff -; SI-NEXT: s_lshl_b32 s5, s75, 8 +; SI-NEXT: s_lshl_b32 s5, s92, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 -; SI-NEXT: s_and_b32 s4, s52, 0xff -; SI-NEXT: s_lshl_b32 s5, s30, 8 +; SI-NEXT: s_and_b32 s4, s26, 0xff +; SI-NEXT: s_lshl_b32 s5, s20, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: s_and_b32 s4, s69, 0xff -; SI-NEXT: s_lshl_b32 s5, s53, 8 +; SI-NEXT: s_and_b32 s4, s74, 0xff +; SI-NEXT: s_lshl_b32 s5, s51, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 -; SI-NEXT: s_and_b32 s4, s78, 0xff -; SI-NEXT: s_lshl_b32 s5, s66, 8 +; SI-NEXT: s_and_b32 s4, s22, 0xff +; SI-NEXT: s_lshl_b32 s5, s86, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 -; SI-NEXT: s_and_b32 s4, s77, 0xff -; SI-NEXT: s_lshl_b32 s5, s34, 8 +; SI-NEXT: s_and_b32 s4, s61, 0xff +; SI-NEXT: s_lshl_b32 s5, s87, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 -; SI-NEXT: s_and_b32 s4, s68, 0xff -; SI-NEXT: s_lshl_b32 s5, s96, 8 +; SI-NEXT: s_and_b32 s4, s63, 0xff +; SI-NEXT: s_lshl_b32 s5, s82, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: s_and_b32 s4, s94, 0xff -; SI-NEXT: s_lshl_b32 s5, s86, 8 +; SI-NEXT: s_and_b32 s4, s62, 0xff +; SI-NEXT: s_lshl_b32 s5, s80, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 -; SI-NEXT: s_and_b32 s4, s51, 0xff +; SI-NEXT: s_and_b32 s4, s55, 0xff ; SI-NEXT: s_lshl_b32 s5, s84, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 -; SI-NEXT: s_and_b32 s4, s87, 0xff -; SI-NEXT: s_lshl_b32 s5, s82, 8 +; SI-NEXT: s_and_b32 s4, s83, 0xff +; SI-NEXT: s_lshl_b32 s5, s71, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 -; SI-NEXT: s_and_b32 s4, s80, 0xff -; SI-NEXT: s_lshl_b32 s5, s49, 8 +; SI-NEXT: s_and_b32 s4, s70, 0xff +; SI-NEXT: s_lshl_b32 s5, s64, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 -; SI-NEXT: s_and_b32 s4, s83, 0xff -; SI-NEXT: s_lshl_b32 s5, s70, 8 +; SI-NEXT: s_and_b32 s4, s67, 0xff +; SI-NEXT: s_lshl_b32 s5, s65, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 -; SI-NEXT: s_and_b32 s4, s71, 0xff -; SI-NEXT: s_lshl_b32 s5, s65, 8 +; SI-NEXT: s_and_b32 s4, s49, 0xff +; SI-NEXT: s_lshl_b32 s5, s52, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 ; SI-NEXT: s_and_b32 s4, s17, 0xff -; SI-NEXT: s_lshl_b32 s5, s64, 8 +; SI-NEXT: s_lshl_b32 s5, s48, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 ; SI-NEXT: s_and_b32 s4, s19, 0xff -; SI-NEXT: s_lshl_b32 s5, s48, 8 +; SI-NEXT: s_lshl_b32 s5, s35, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 -; SI-NEXT: s_and_b32 s4, s50, 0xff -; SI-NEXT: s_lshl_b32 s5, s37, 8 +; SI-NEXT: s_and_b32 s4, s54, 0xff +; SI-NEXT: s_lshl_b32 s5, s76, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s8, v61, 36 ; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 -; SI-NEXT: s_and_b32 s4, s8, 0xff -; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_and_b32 s4, s50, 0xff +; SI-NEXT: s_lshl_b32 s5, s59, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 -; SI-NEXT: s_and_b32 s4, s76, 0xff -; SI-NEXT: s_lshl_b32 s5, s58, 8 +; SI-NEXT: s_and_b32 s4, s75, 0xff +; SI-NEXT: s_lshl_b32 s5, s47, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 -; SI-NEXT: s_and_b32 s4, s47, 0xff -; SI-NEXT: s_lshl_b32 s5, s41, 8 +; SI-NEXT: s_and_b32 s4, s57, 0xff +; SI-NEXT: s_lshl_b32 s5, s11, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 -; SI-NEXT: s_and_b32 s4, s11, 0xff -; SI-NEXT: s_lshl_b32 s5, s7, 8 +; SI-NEXT: s_and_b32 s4, s14, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 -; SI-NEXT: s_and_b32 s4, s97, 0xff -; SI-NEXT: s_lshl_b32 s5, s81, 8 +; SI-NEXT: s_and_b32 s4, s85, 0xff +; SI-NEXT: s_lshl_b32 s5, s25, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 -; SI-NEXT: s_and_b32 s4, s85, 0xff -; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_and_b32 s4, s97, 0xff +; SI-NEXT: s_lshl_b32 s5, s69, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 -; SI-NEXT: s_and_b32 s4, s40, 0xff -; SI-NEXT: s_lshl_b32 s5, s72, 8 +; SI-NEXT: s_and_b32 s4, s21, 0xff +; SI-NEXT: s_lshl_b32 s5, s66, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s69, v61, 35 ; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 -; SI-NEXT: s_and_b32 s4, s69, 0xff -; SI-NEXT: s_lshl_b32 s5, s73, 8 +; SI-NEXT: s_and_b32 s4, s81, 0xff +; SI-NEXT: s_lshl_b32 s5, s45, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s68, v61, 34 ; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 -; SI-NEXT: s_and_b32 s4, s68, 0xff -; SI-NEXT: s_lshl_b32 s5, s89, 8 +; SI-NEXT: s_and_b32 s4, s53, 0xff +; SI-NEXT: s_lshl_b32 s5, s39, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s66, v61, 33 ; SI-NEXT: v_cvt_f32_f16_e32 v41, s4 -; SI-NEXT: s_and_b32 s4, s66, 0xff -; SI-NEXT: s_lshl_b32 s5, s16, 8 +; SI-NEXT: s_and_b32 s4, s34, 0xff +; SI-NEXT: s_lshl_b32 s5, s30, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s53, v61, 32 -; SI-NEXT: v_readlane_b32 s94, v61, 31 ; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 -; SI-NEXT: s_and_b32 s4, s53, 0xff -; SI-NEXT: s_lshl_b32 s5, s94, 8 +; SI-NEXT: s_and_b32 s4, s89, 0xff +; SI-NEXT: s_lshl_b32 s5, s68, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s34, v61, 30 -; SI-NEXT: v_readlane_b32 s96, v61, 29 +; SI-NEXT: v_readlane_b32 s5, v61, 35 ; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 -; SI-NEXT: s_and_b32 s4, s34, 0xff -; SI-NEXT: s_lshl_b32 s5, s96, 8 +; SI-NEXT: s_and_b32 s4, s93, 0xff +; SI-NEXT: s_mov_b32 s99, s5 +; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s51, v61, 28 -; SI-NEXT: v_readlane_b32 s86, v61, 27 ; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 -; SI-NEXT: s_and_b32 s4, s51, 0xff -; SI-NEXT: s_lshl_b32 s5, s86, 8 +; SI-NEXT: v_readlane_b32 s4, v61, 34 +; SI-NEXT: v_readlane_b32 s5, v61, 33 +; SI-NEXT: s_mov_b32 s31, s6 +; SI-NEXT: s_mov_b32 s6, s4 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_mov_b32 s55, s5 +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 32 +; SI-NEXT: v_readlane_b32 s5, v61, 31 +; SI-NEXT: s_mov_b32 s86, s4 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_mov_b32 s51, s5 +; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s87, v61, 26 -; SI-NEXT: v_readlane_b32 s84, v61, 25 ; SI-NEXT: v_cvt_f32_f16_e32 v45, s4 -; SI-NEXT: s_and_b32 s4, s87, 0xff -; SI-NEXT: s_lshl_b32 s5, s84, 8 +; SI-NEXT: v_readlane_b32 s4, v61, 30 +; SI-NEXT: v_readlane_b32 s5, v61, 29 +; SI-NEXT: s_mov_b32 s36, s96 +; SI-NEXT: s_mov_b32 s96, s4 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_mov_b32 s82, s5 +; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s82, v61, 24 -; SI-NEXT: v_readlane_b32 s80, v61, 23 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 -; SI-NEXT: s_and_b32 s4, s82, 0xff -; SI-NEXT: s_lshl_b32 s5, s80, 8 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 28 +; SI-NEXT: v_readlane_b32 s5, v61, 27 +; SI-NEXT: s_mov_b32 s83, s4 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_mov_b32 s87, s5 +; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s83, v61, 22 -; SI-NEXT: v_readlane_b32 s49, v61, 21 ; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 -; SI-NEXT: s_and_b32 s4, s83, 0xff -; SI-NEXT: s_lshl_b32 s5, s49, 8 +; SI-NEXT: v_readlane_b32 s4, v61, 26 +; SI-NEXT: v_readlane_b32 s5, v61, 25 +; SI-NEXT: s_mov_b32 s84, s4 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_mov_b32 s80, s5 +; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s71, v61, 20 -; SI-NEXT: v_readlane_b32 s70, v61, 19 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s4 -; SI-NEXT: s_and_b32 s4, s71, 0xff -; SI-NEXT: s_lshl_b32 s5, s70, 8 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 24 +; SI-NEXT: v_readlane_b32 s5, v61, 23 +; SI-NEXT: s_mov_b32 s71, s4 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_mov_b32 s70, s5 +; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s65, v61, 18 -; SI-NEXT: v_readlane_b32 s54, v61, 17 ; SI-NEXT: v_cvt_f32_f16_e32 v57, s4 -; SI-NEXT: s_and_b32 s4, s65, 0xff -; SI-NEXT: s_lshl_b32 s5, s54, 8 -; SI-NEXT: s_mov_b32 s17, s19 -; SI-NEXT: s_mov_b32 s19, s50 +; SI-NEXT: v_readlane_b32 s4, v61, 22 +; SI-NEXT: v_readlane_b32 s5, v61, 21 +; SI-NEXT: s_mov_b32 s49, s4 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_mov_b32 s67, s5 +; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s67, v61, 16 -; SI-NEXT: v_readlane_b32 s50, v61, 15 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 -; SI-NEXT: s_and_b32 s4, s67, 0xff -; SI-NEXT: s_lshl_b32 s5, s50, 8 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 20 +; SI-NEXT: v_readlane_b32 s5, v61, 19 +; SI-NEXT: s_mov_b32 s65, s4 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_mov_b32 s64, s5 +; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s64, v61, 14 -; SI-NEXT: v_readlane_b32 s52, v61, 13 ; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 -; SI-NEXT: s_and_b32 s4, s64, 0xff -; SI-NEXT: s_lshl_b32 s5, s52, 8 -; SI-NEXT: s_mov_b32 s23, s48 +; SI-NEXT: v_readlane_b32 s4, v61, 18 +; SI-NEXT: v_readlane_b32 s5, v61, 17 +; SI-NEXT: s_mov_b32 s17, s19 +; SI-NEXT: s_mov_b32 s19, s54 +; SI-NEXT: s_mov_b32 s26, s50 +; SI-NEXT: s_mov_b32 s54, s4 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_mov_b32 s50, s5 +; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s35, v61, 12 -; SI-NEXT: v_readlane_b32 s48, v61, 11 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 -; SI-NEXT: s_and_b32 s4, s35, 0xff -; SI-NEXT: s_lshl_b32 s5, s48, 8 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 16 +; SI-NEXT: v_readlane_b32 s5, v61, 15 +; SI-NEXT: s_mov_b32 s23, s35 +; SI-NEXT: s_mov_b32 s35, s4 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_mov_b32 s48, s5 +; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s30, v61, 10 -; SI-NEXT: v_readlane_b32 s39, v61, 9 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_and_b32 s4, s30, 0xff -; SI-NEXT: s_lshl_b32 s5, s39, 8 -; SI-NEXT: s_mov_b32 s26, s37 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s37, v61, 8 -; SI-NEXT: v_readlane_b32 s75, v61, 7 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 -; SI-NEXT: s_and_b32 s4, s37, 0xff -; SI-NEXT: s_lshl_b32 s5, s75, 8 +; SI-NEXT: v_readlane_b32 s4, v61, 14 +; SI-NEXT: v_readlane_b32 s5, v61, 13 +; SI-NEXT: s_mov_b32 s15, s75 +; SI-NEXT: s_mov_b32 s52, s4 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_mov_b32 s75, s5 +; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s92, v61, 6 -; SI-NEXT: v_readlane_b32 s77, v61, 5 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_and_b32 s4, s92, 0xff -; SI-NEXT: s_lshl_b32 s5, s77, 8 -; SI-NEXT: s_mov_b32 s28, s29 -; SI-NEXT: s_mov_b32 s29, s76 +; SI-NEXT: v_readlane_b32 s4, v61, 12 +; SI-NEXT: v_readlane_b32 s5, v61, 11 +; SI-NEXT: s_mov_b32 s88, s30 +; SI-NEXT: s_mov_b32 s30, s4 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_mov_b32 s92, s5 +; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s78, v61, 4 -; SI-NEXT: v_readlane_b32 s76, v61, 3 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_and_b32 s4, s78, 0xff -; SI-NEXT: s_lshl_b32 s5, s76, 8 +; SI-NEXT: v_readlane_b32 s4, v61, 10 +; SI-NEXT: v_readlane_b32 s5, v61, 9 +; SI-NEXT: s_mov_b32 s13, s39 +; SI-NEXT: s_mov_b32 s39, s4 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_mov_b32 s77, s5 +; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_mov_b32 s99, s55 -; SI-NEXT: s_mov_b32 s20, s88 -; SI-NEXT: s_mov_b32 s24, s98 -; SI-NEXT: s_mov_b32 s59, s58 +; SI-NEXT: s_mov_b32 s38, s95 +; SI-NEXT: s_mov_b32 s20, s76 +; SI-NEXT: s_mov_b32 s98, s59 ; SI-NEXT: s_mov_b32 s56, s47 -; SI-NEXT: s_mov_b32 s46, s41 +; SI-NEXT: s_mov_b32 s58, s57 ; SI-NEXT: s_mov_b32 s12, s11 -; SI-NEXT: s_mov_b32 s11, s7 -; SI-NEXT: s_mov_b32 s7, s97 -; SI-NEXT: s_mov_b32 s97, s81 -; SI-NEXT: s_mov_b32 s81, s85 -; SI-NEXT: s_mov_b32 s6, s40 -; SI-NEXT: s_mov_b32 s40, s72 -; SI-NEXT: s_mov_b32 s45, s73 -; SI-NEXT: s_mov_b32 s15, s89 +; SI-NEXT: s_mov_b32 s41, s14 +; SI-NEXT: s_mov_b32 s28, s29 +; SI-NEXT: s_mov_b32 s7, s85 +; SI-NEXT: s_mov_b32 s29, s25 +; SI-NEXT: s_mov_b32 s85, s97 +; SI-NEXT: s_mov_b32 s25, s69 +; SI-NEXT: s_mov_b32 s97, s21 +; SI-NEXT: s_mov_b32 s37, s66 +; SI-NEXT: s_mov_b32 s69, s81 +; SI-NEXT: s_mov_b32 s44, s45 +; SI-NEXT: s_mov_b32 s66, s53 +; SI-NEXT: s_mov_b32 s53, s34 +; SI-NEXT: s_mov_b32 s34, s89 +; SI-NEXT: s_mov_b32 s94, s68 +; SI-NEXT: s_mov_b32 s89, s93 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_mov_b32 s55, s93 -; SI-NEXT: s_mov_b32 s95, s91 -; SI-NEXT: s_mov_b32 s31, s90 ; SI-NEXT: s_cbranch_execnz .LBB93_3 ; SI-NEXT: .LBB93_2: ; %cmp.true -; SI-NEXT: s_add_i32 s4, s78, 3 +; SI-NEXT: s_add_i32 s4, s39, 3 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s5, s76, 8 +; SI-NEXT: s_lshl_b32 s5, s77, 8 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s5, s92, 3 +; SI-NEXT: s_add_i32 s5, s30, 3 ; SI-NEXT: s_and_b32 s5, s5, 0xff -; SI-NEXT: s_lshl_b32 vcc_lo, s77, 8 +; SI-NEXT: s_lshl_b32 vcc_lo, s92, 8 ; SI-NEXT: s_or_b32 s5, vcc_lo, s5 -; SI-NEXT: s_add_i32 vcc_lo, s37, 3 +; SI-NEXT: s_add_i32 vcc_lo, s52, 3 ; SI-NEXT: s_and_b32 vcc_lo, vcc_lo, 0xff ; SI-NEXT: s_lshl_b32 vcc_hi, s75, 8 ; SI-NEXT: s_or_b32 vcc_lo, vcc_hi, vcc_lo -; SI-NEXT: s_add_i32 vcc_hi, s30, 3 +; SI-NEXT: s_add_i32 vcc_hi, s35, 3 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s60, s39, 8 +; SI-NEXT: s_lshl_b32 s60, s48, 8 ; SI-NEXT: s_or_b32 s60, s60, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s35, 3 +; SI-NEXT: s_add_i32 vcc_hi, s54, 3 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s61, s48, 8 +; SI-NEXT: s_lshl_b32 s61, s50, 8 ; SI-NEXT: s_or_b32 s61, s61, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s64, 3 +; SI-NEXT: s_add_i32 vcc_hi, s65, 3 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s62, s52, 8 +; SI-NEXT: s_lshl_b32 s62, s64, 8 ; SI-NEXT: s_or_b32 s62, s62, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s67, 3 +; SI-NEXT: s_add_i32 vcc_hi, s49, 3 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s63, s50, 8 +; SI-NEXT: s_lshl_b32 s63, s67, 8 ; SI-NEXT: s_or_b32 s10, s63, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s65, 3 +; SI-NEXT: s_add_i32 vcc_hi, s71, 3 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s72, s54, 8 +; SI-NEXT: s_lshl_b32 s72, s70, 8 ; SI-NEXT: s_or_b32 s72, s72, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s71, 3 +; SI-NEXT: s_add_i32 vcc_hi, s84, 3 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s73, s70, 8 +; SI-NEXT: s_lshl_b32 s73, s80, 8 ; SI-NEXT: s_or_b32 s73, s73, vcc_hi ; SI-NEXT: s_add_i32 vcc_hi, s83, 3 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s74, s49, 8 +; SI-NEXT: s_lshl_b32 s74, s87, 8 ; SI-NEXT: s_or_b32 s74, s74, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s82, 3 +; SI-NEXT: s_add_i32 vcc_hi, s96, 3 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s75, s80, 8 +; SI-NEXT: s_lshl_b32 s75, s82, 8 ; SI-NEXT: s_or_b32 s75, s75, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s87, 3 +; SI-NEXT: s_add_i32 vcc_hi, s86, 3 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s76, s84, 8 +; SI-NEXT: s_lshl_b32 s76, s51, 8 ; SI-NEXT: s_or_b32 s76, s76, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s51, 3 -; SI-NEXT: s_add_i32 s93, s53, 3 +; SI-NEXT: s_add_i32 vcc_hi, s6, 3 +; SI-NEXT: s_add_i32 s93, s34, 3 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s77, s86, 8 -; SI-NEXT: s_add_i32 s89, s34, 3 +; SI-NEXT: s_lshl_b32 s77, s55, 8 +; SI-NEXT: s_add_i32 s89, s89, 3 ; SI-NEXT: s_and_b32 s93, s93, 0xff ; SI-NEXT: s_lshl_b32 s78, s94, 8 -; SI-NEXT: s_add_i32 s34, s66, 3 +; SI-NEXT: s_add_i32 s34, s53, 3 ; SI-NEXT: s_or_b32 s77, s77, vcc_hi ; SI-NEXT: s_and_b32 s89, s89, 0xff -; SI-NEXT: s_lshl_b32 vcc_hi, s96, 8 +; SI-NEXT: s_lshl_b32 vcc_hi, s99, 8 ; SI-NEXT: s_or_b32 s22, s78, s93 ; SI-NEXT: s_and_b32 s93, s34, 0xff -; SI-NEXT: s_lshl_b32 s92, s16, 8 -; SI-NEXT: s_add_i32 s53, s68, 3 +; SI-NEXT: s_lshl_b32 s92, s88, 8 +; SI-NEXT: s_add_i32 s53, s66, 3 ; SI-NEXT: s_or_b32 s89, vcc_hi, s89 ; SI-NEXT: s_or_b32 s92, s92, s93 ; SI-NEXT: s_and_b32 s93, s53, 0xff -; SI-NEXT: s_lshl_b32 vcc_hi, s15, 8 +; SI-NEXT: s_lshl_b32 vcc_hi, s13, 8 ; SI-NEXT: s_add_i32 s66, s69, 3 ; SI-NEXT: s_or_b32 s93, vcc_hi, s93 ; SI-NEXT: s_and_b32 vcc_hi, s66, 0xff -; SI-NEXT: s_lshl_b32 s34, s45, 8 -; SI-NEXT: s_add_i32 s68, s6, 3 +; SI-NEXT: s_lshl_b32 s34, s44, 8 +; SI-NEXT: s_add_i32 s68, s97, 3 ; SI-NEXT: s_or_b32 vcc_hi, s34, vcc_hi ; SI-NEXT: s_and_b32 s34, s68, 0xff -; SI-NEXT: s_lshl_b32 s39, s40, 8 -; SI-NEXT: s_add_i32 s69, s81, 3 +; SI-NEXT: s_lshl_b32 s39, s37, 8 +; SI-NEXT: s_add_i32 s69, s85, 3 ; SI-NEXT: s_or_b32 s34, s39, s34 ; SI-NEXT: s_and_b32 s39, s69, 0xff -; SI-NEXT: s_lshl_b32 s52, s21, 8 +; SI-NEXT: s_lshl_b32 s52, s25, 8 ; SI-NEXT: s_add_i32 s81, s7, 3 ; SI-NEXT: s_or_b32 s39, s52, s39 ; SI-NEXT: s_and_b32 s52, s81, 0xff -; SI-NEXT: s_lshl_b32 s53, s97, 8 -; SI-NEXT: s_add_i32 s85, s12, 3 +; SI-NEXT: s_lshl_b32 s53, s29, 8 +; SI-NEXT: s_add_i32 s85, s41, 3 ; SI-NEXT: s_or_b32 s52, s53, s52 ; SI-NEXT: s_and_b32 s53, s85, 0xff -; SI-NEXT: s_lshl_b32 s64, s11, 8 -; SI-NEXT: s_add_i32 s97, s56, 3 +; SI-NEXT: s_lshl_b32 s64, s28, 8 +; SI-NEXT: s_add_i32 s97, s58, 3 ; SI-NEXT: s_or_b32 s53, s64, s53 ; SI-NEXT: s_and_b32 s64, s97, 0xff -; SI-NEXT: s_lshl_b32 s66, s46, 8 -; SI-NEXT: s_add_i32 s21, s29, 3 +; SI-NEXT: s_lshl_b32 s66, s12, 8 +; SI-NEXT: s_add_i32 s21, s15, 3 ; SI-NEXT: s_or_b32 s64, s66, s64 ; SI-NEXT: s_and_b32 s21, s21, 0xff -; SI-NEXT: s_lshl_b32 s66, s59, 8 -; SI-NEXT: s_add_i32 s25, s8, 3 +; SI-NEXT: s_lshl_b32 s66, s56, 8 +; SI-NEXT: s_add_i32 s25, s26, 3 ; SI-NEXT: s_or_b32 s66, s66, s21 ; SI-NEXT: s_and_b32 s21, s25, 0xff -; SI-NEXT: s_lshl_b32 s6, s28, 8 +; SI-NEXT: s_lshl_b32 s6, s98, 8 ; SI-NEXT: s_add_i32 s29, s19, 3 ; SI-NEXT: s_or_b32 s67, s6, s21 ; SI-NEXT: s_and_b32 s6, s29, 0xff -; SI-NEXT: s_lshl_b32 s18, s26, 8 +; SI-NEXT: s_lshl_b32 s18, s20, 8 ; SI-NEXT: s_add_i32 s28, s17, 3 ; SI-NEXT: s_or_b32 s68, s18, s6 ; SI-NEXT: s_and_b32 s6, s28, 0xff ; SI-NEXT: s_lshl_b32 s18, s23, 8 ; SI-NEXT: s_or_b32 s69, s18, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 17 +; SI-NEXT: v_readlane_b32 s6, v62, 13 ; SI-NEXT: s_add_i32 s7, s6, 3 -; SI-NEXT: v_readlane_b32 s16, v62, 15 +; SI-NEXT: v_readlane_b32 s16, v62, 11 ; SI-NEXT: s_and_b32 s6, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v62, 16 +; SI-NEXT: v_readlane_b32 s7, v62, 12 ; SI-NEXT: s_add_i32 s27, s16, 3 -; SI-NEXT: v_readlane_b32 s16, v62, 13 +; SI-NEXT: v_readlane_b32 s16, v62, 9 ; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_lshl_b32 s23, s16, 8 -; SI-NEXT: v_readlane_b32 s16, v62, 14 -; SI-NEXT: s_mov_b32 s91, s24 +; SI-NEXT: v_readlane_b32 s16, v62, 10 ; SI-NEXT: s_or_b32 s70, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 20 +; SI-NEXT: v_readlane_b32 s6, v62, 17 ; SI-NEXT: s_add_i32 s24, s16, 3 -; SI-NEXT: v_readlane_b32 s16, v62, 11 +; SI-NEXT: v_readlane_b32 s16, v62, 7 ; SI-NEXT: s_add_i32 s11, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 18 +; SI-NEXT: v_readlane_b32 s7, v62, 14 ; SI-NEXT: s_lshl_b32 s19, s16, 8 -; SI-NEXT: v_readlane_b32 s16, v62, 12 -; SI-NEXT: s_mov_b32 s90, s20 +; SI-NEXT: v_readlane_b32 s16, v62, 8 ; SI-NEXT: s_and_b32 s6, s11, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_add_i32 s20, s16, 3 -; SI-NEXT: v_readlane_b32 s16, v62, 9 +; SI-NEXT: v_readlane_b32 s16, v62, 5 ; SI-NEXT: s_or_b32 s71, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 22 +; SI-NEXT: v_readlane_b32 s6, v62, 18 ; SI-NEXT: s_and_b32 s20, s20, 0xff ; SI-NEXT: s_lshl_b32 s17, s16, 8 -; SI-NEXT: v_readlane_b32 s16, v62, 10 +; SI-NEXT: v_readlane_b32 s16, v62, 6 ; SI-NEXT: s_add_i32 s12, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 19 +; SI-NEXT: v_readlane_b32 s7, v62, 15 ; SI-NEXT: s_or_b32 s17, s17, s20 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s20, v62, 8 +; SI-NEXT: v_readlane_b32 s20, v62, 4 ; SI-NEXT: s_and_b32 s6, s12, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s20, s20, 8 ; SI-NEXT: s_or_b32 s81, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 23 +; SI-NEXT: v_readlane_b32 s6, v62, 20 ; SI-NEXT: s_and_b32 s24, s24, 0xff ; SI-NEXT: s_or_b32 s16, s20, s16 -; SI-NEXT: v_readlane_b32 s20, v62, 7 +; SI-NEXT: v_readlane_b32 s20, v62, 3 ; SI-NEXT: s_add_i32 s14, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 21 +; SI-NEXT: v_readlane_b32 s7, v62, 16 ; SI-NEXT: s_or_b32 s19, s19, s24 ; SI-NEXT: s_add_i32 s98, s20, 3 -; SI-NEXT: v_readlane_b32 s24, v62, 6 +; SI-NEXT: v_readlane_b32 s24, v62, 2 ; SI-NEXT: s_and_b32 s6, s14, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_and_b32 s20, s98, 0xff ; SI-NEXT: s_lshl_b32 s24, s24, 8 ; SI-NEXT: s_or_b32 s83, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 26 +; SI-NEXT: v_readlane_b32 s6, v62, 23 ; SI-NEXT: s_and_b32 s27, s27, 0xff ; SI-NEXT: s_or_b32 s20, s24, s20 -; SI-NEXT: v_readlane_b32 s24, v62, 5 +; SI-NEXT: v_readlane_b32 s24, v62, 1 ; SI-NEXT: s_add_i32 s41, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 24 +; SI-NEXT: v_readlane_b32 s7, v62, 19 ; SI-NEXT: s_or_b32 s23, s23, s27 ; SI-NEXT: s_add_i32 s86, s24, 3 -; SI-NEXT: v_readlane_b32 s27, v62, 4 +; SI-NEXT: v_readlane_b32 s27, v62, 0 ; SI-NEXT: s_and_b32 s6, s41, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_and_b32 s24, s86, 0xff @@ -178688,123 +178710,126 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_or_b32 s85, s7, s6 ; SI-NEXT: v_readlane_b32 s6, v62, 28 ; SI-NEXT: s_or_b32 s24, s27, s24 -; SI-NEXT: v_readlane_b32 s27, v62, 3 +; SI-NEXT: v_readlane_b32 s27, v61, 63 ; SI-NEXT: s_add_i32 s46, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 25 +; SI-NEXT: v_readlane_b32 s7, v62, 21 ; SI-NEXT: s_add_i32 s12, s73, 0x300 ; SI-NEXT: s_add_i32 s82, s27, 3 -; SI-NEXT: v_readlane_b32 s73, v62, 2 +; SI-NEXT: v_readlane_b32 s73, v61, 62 ; SI-NEXT: s_and_b32 s6, s46, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_and_b32 s27, s82, 0xff ; SI-NEXT: s_lshl_b32 s73, s73, 8 ; SI-NEXT: s_or_b32 s96, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 31 +; SI-NEXT: v_readlane_b32 s6, v62, 29 ; SI-NEXT: s_or_b32 s27, s73, s27 -; SI-NEXT: v_readlane_b32 s73, v62, 1 +; SI-NEXT: v_readlane_b32 s73, v61, 61 ; SI-NEXT: s_add_i32 s47, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 27 +; SI-NEXT: v_readlane_b32 s7, v62, 22 ; SI-NEXT: s_add_i32 s13, s74, 0x300 ; SI-NEXT: s_add_i32 s65, s73, 3 -; SI-NEXT: v_readlane_b32 s74, v62, 0 +; SI-NEXT: v_readlane_b32 s74, v61, 60 ; SI-NEXT: s_and_b32 s6, s47, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_and_b32 s73, s65, 0xff ; SI-NEXT: s_lshl_b32 s74, s74, 8 ; SI-NEXT: s_or_b32 s97, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 34 +; SI-NEXT: v_readlane_b32 s6, v62, 30 ; SI-NEXT: s_or_b32 s73, s74, s73 -; SI-NEXT: v_readlane_b32 s74, v61, 63 +; SI-NEXT: v_readlane_b32 s74, v61, 59 ; SI-NEXT: s_add_i32 s56, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 29 +; SI-NEXT: v_readlane_b32 s7, v62, 24 ; SI-NEXT: s_add_i32 s14, s75, 0x300 ; SI-NEXT: s_add_i32 s54, s74, 3 -; SI-NEXT: v_readlane_b32 s75, v61, 62 +; SI-NEXT: v_readlane_b32 s75, v61, 58 ; SI-NEXT: s_and_b32 s6, s56, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_and_b32 s74, s54, 0xff ; SI-NEXT: s_lshl_b32 s75, s75, 8 ; SI-NEXT: s_or_b32 s63, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 36 +; SI-NEXT: v_readlane_b32 s6, v62, 32 ; SI-NEXT: s_or_b32 s74, s75, s74 -; SI-NEXT: v_readlane_b32 s75, v61, 61 +; SI-NEXT: v_readlane_b32 s75, v61, 57 ; SI-NEXT: s_add_i32 s58, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 30 +; SI-NEXT: v_readlane_b32 s7, v62, 25 ; SI-NEXT: s_add_i32 s15, s76, 0x300 ; SI-NEXT: s_add_i32 s50, s75, 3 -; SI-NEXT: v_readlane_b32 s76, v61, 60 +; SI-NEXT: v_readlane_b32 s76, v61, 56 ; SI-NEXT: s_and_b32 s6, s58, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_and_b32 s75, s50, 0xff ; SI-NEXT: s_lshl_b32 s76, s76, 8 ; SI-NEXT: s_or_b32 s79, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 37 +; SI-NEXT: v_readlane_b32 s6, v62, 33 ; SI-NEXT: s_or_b32 s75, s76, s75 -; SI-NEXT: v_readlane_b32 s76, v61, 59 +; SI-NEXT: v_readlane_b32 s76, v61, 55 ; SI-NEXT: s_add_i32 s59, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 33 +; SI-NEXT: v_readlane_b32 s7, v62, 27 ; SI-NEXT: s_add_i32 s18, s77, 0x300 ; SI-NEXT: s_add_i32 s48, s76, 3 -; SI-NEXT: v_readlane_b32 s77, v61, 58 +; SI-NEXT: v_readlane_b32 s77, v61, 54 ; SI-NEXT: s_and_b32 s6, s59, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_and_b32 s76, s48, 0xff ; SI-NEXT: s_lshl_b32 s77, s77, 8 ; SI-NEXT: s_or_b32 s78, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 35 +; SI-NEXT: v_readlane_b32 s6, v62, 31 ; SI-NEXT: s_or_b32 s76, s77, s76 -; SI-NEXT: v_readlane_b32 s77, v61, 57 +; SI-NEXT: v_readlane_b32 s77, v61, 53 ; SI-NEXT: s_add_i32 s57, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 32 +; SI-NEXT: v_readlane_b32 s7, v62, 26 ; SI-NEXT: s_add_i32 s11, s72, 0x300 ; SI-NEXT: s_add_i32 s72, s79, 0x300 ; SI-NEXT: s_add_i32 s37, s77, 3 -; SI-NEXT: v_readlane_b32 s79, v61, 56 +; SI-NEXT: v_readlane_b32 s79, v61, 52 ; SI-NEXT: s_and_b32 s6, s57, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_and_b32 s77, s37, 0xff ; SI-NEXT: s_lshl_b32 s79, s79, 8 ; SI-NEXT: s_or_b32 s88, s7, s6 ; SI-NEXT: s_or_b32 s77, s79, s77 -; SI-NEXT: v_readlane_b32 s79, v61, 55 +; SI-NEXT: v_readlane_b32 s79, v61, 51 ; SI-NEXT: s_add_i32 s21, s89, 0x300 ; SI-NEXT: s_add_i32 s89, s88, 0x300 ; SI-NEXT: s_add_i32 s35, s79, 3 -; SI-NEXT: v_readlane_b32 s88, v61, 54 +; SI-NEXT: v_readlane_b32 s88, v61, 50 ; SI-NEXT: s_and_b32 s79, s35, 0xff ; SI-NEXT: s_lshl_b32 s88, s88, 8 -; SI-NEXT: s_or_b32 s79, s88, s79 -; SI-NEXT: v_readlane_b32 s88, v61, 53 +; SI-NEXT: v_readlane_b32 s90, v61, 48 ; SI-NEXT: s_add_i32 s25, s92, 0x300 +; SI-NEXT: s_or_b32 s79, s88, s79 +; SI-NEXT: v_readlane_b32 s88, v61, 49 +; SI-NEXT: s_lshl_b32 s92, s90, 8 +; SI-NEXT: v_readlane_b32 s90, v61, 47 ; SI-NEXT: s_add_i32 s30, s88, 3 -; SI-NEXT: v_readlane_b32 s92, v61, 52 +; SI-NEXT: s_add_i32 s94, s90, 3 +; SI-NEXT: v_readlane_b32 s90, v61, 46 ; SI-NEXT: s_and_b32 s88, s30, 0xff -; SI-NEXT: s_lshl_b32 s92, s92, 8 +; SI-NEXT: s_lshl_b32 s91, s90, 8 +; SI-NEXT: v_readlane_b32 s90, v61, 45 ; SI-NEXT: s_or_b32 s88, s92, s88 -; SI-NEXT: v_readlane_b32 s92, v61, 51 -; SI-NEXT: s_add_i32 s94, s92, 3 ; SI-NEXT: s_and_b32 s92, s94, 0xff -; SI-NEXT: s_lshl_b32 s91, s91, 8 ; SI-NEXT: s_add_i32 s90, s90, 3 ; SI-NEXT: s_or_b32 s91, s91, s92 ; SI-NEXT: s_and_b32 s90, s90, 0xff -; SI-NEXT: s_lshl_b32 s92, s31, 8 +; SI-NEXT: s_lshl_b32 s92, s38, 8 ; SI-NEXT: s_or_b32 s90, s92, s90 -; SI-NEXT: v_readlane_b32 s92, v61, 50 -; SI-NEXT: s_add_i32 s92, s92, 3 +; SI-NEXT: v_readlane_b32 s92, v61, 44 ; SI-NEXT: s_add_i32 s26, s93, 0x300 +; SI-NEXT: s_add_i32 s92, s92, 3 +; SI-NEXT: v_readlane_b32 s93, v61, 43 ; SI-NEXT: s_and_b32 s92, s92, 0xff -; SI-NEXT: s_lshl_b32 s93, s95, 8 +; SI-NEXT: s_lshl_b32 s93, s93, 8 ; SI-NEXT: s_or_b32 s92, s93, s92 -; SI-NEXT: v_readlane_b32 s93, v61, 49 +; SI-NEXT: v_readlane_b32 s93, v61, 42 ; SI-NEXT: s_add_i32 s93, s93, 3 ; SI-NEXT: s_and_b32 s93, s93, 0xff -; SI-NEXT: s_lshl_b32 s94, s55, 8 +; SI-NEXT: s_lshl_b32 s94, s36, 8 ; SI-NEXT: s_or_b32 s93, s94, s93 -; SI-NEXT: v_readlane_b32 s94, v61, 48 +; SI-NEXT: v_readlane_b32 s94, v61, 41 ; SI-NEXT: s_add_i32 s94, s94, 3 ; SI-NEXT: s_and_b32 s94, s94, 0xff -; SI-NEXT: s_lshl_b32 s95, s99, 8 +; SI-NEXT: s_lshl_b32 s95, s31, 8 ; SI-NEXT: s_or_b32 s94, s95, s94 ; SI-NEXT: v_readlane_b32 s95, v61, 1 ; SI-NEXT: s_add_i32 s95, s95, 3 @@ -178812,30 +178837,30 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_add_i32 s6, vcc_lo, 0x300 ; SI-NEXT: s_and_b32 s95, s95, 0xff ; SI-NEXT: s_lshl_b32 vcc_lo, s30, 8 -; SI-NEXT: v_readlane_b32 s30, v61, 47 +; SI-NEXT: v_readlane_b32 s30, v61, 3 ; SI-NEXT: s_or_b32 s95, vcc_lo, s95 ; SI-NEXT: s_add_i32 vcc_lo, s30, 3 ; SI-NEXT: v_readlane_b32 s30, v61, 2 ; SI-NEXT: s_add_i32 s28, vcc_hi, 0x300 ; SI-NEXT: s_and_b32 vcc_lo, vcc_lo, 0xff ; SI-NEXT: s_lshl_b32 vcc_hi, s30, 8 -; SI-NEXT: v_readlane_b32 s30, v61, 46 +; SI-NEXT: v_readlane_b32 s30, v61, 5 ; SI-NEXT: s_or_b32 vcc_lo, vcc_hi, vcc_lo ; SI-NEXT: s_add_i32 vcc_hi, s30, 3 -; SI-NEXT: v_readlane_b32 s30, v61, 45 +; SI-NEXT: v_readlane_b32 s30, v61, 4 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff ; SI-NEXT: s_lshl_b32 s30, s30, 8 ; SI-NEXT: s_or_b32 vcc_hi, s30, vcc_hi -; SI-NEXT: v_readlane_b32 s30, v61, 44 +; SI-NEXT: v_readlane_b32 s30, v61, 7 ; SI-NEXT: s_add_i32 s30, s30, 3 -; SI-NEXT: v_readlane_b32 s31, v61, 43 +; SI-NEXT: v_readlane_b32 s31, v61, 6 ; SI-NEXT: s_and_b32 s30, s30, 0xff ; SI-NEXT: s_lshl_b32 s31, s31, 8 ; SI-NEXT: s_or_b32 s30, s31, s30 -; SI-NEXT: v_readlane_b32 s31, v61, 42 +; SI-NEXT: v_readlane_b32 s31, v61, 40 ; SI-NEXT: s_add_i32 s29, s34, 0x300 ; SI-NEXT: s_add_i32 s31, s31, 3 -; SI-NEXT: v_readlane_b32 s34, v61, 41 +; SI-NEXT: v_readlane_b32 s34, v61, 8 ; SI-NEXT: s_and_b32 s31, s31, 0xff ; SI-NEXT: s_lshl_b32 s34, s34, 8 ; SI-NEXT: s_or_b32 s31, s34, s31 @@ -178843,23 +178868,23 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v1, s31 ; SI-NEXT: s_addk_i32 s30, 0x300 ; SI-NEXT: s_addk_i32 vcc_hi, 0x300 -; SI-NEXT: v_readlane_b32 s34, v61, 40 +; SI-NEXT: v_readlane_b32 s34, v61, 39 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, s30 ; SI-NEXT: s_add_i32 s34, s34, 3 -; SI-NEXT: v_readlane_b32 s35, v61, 39 +; SI-NEXT: v_readlane_b32 s35, v61, 38 ; SI-NEXT: s_and_b32 s34, s34, 0xff -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, vcc_hi ; SI-NEXT: s_lshl_b32 s35, s35, 8 ; SI-NEXT: s_addk_i32 vcc_lo, 0x300 ; SI-NEXT: s_or_b32 s34, s35, s34 -; SI-NEXT: v_readlane_b32 s35, v61, 38 +; SI-NEXT: v_readlane_b32 s35, v61, 37 ; SI-NEXT: s_add_i32 s35, s35, 3 -; SI-NEXT: v_readlane_b32 s36, v61, 37 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s36, v61, 36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, vcc_lo ; SI-NEXT: s_and_b32 s35, s35, 0xff @@ -178915,10 +178940,10 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v5, s34 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f32_f16_e32 v7, s95 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s94 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s94 ; SI-NEXT: v_cvt_f32_f16_e32 v8, s93 ; SI-NEXT: v_cvt_f32_f16_e32 v11, s92 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s90 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s90 ; SI-NEXT: v_cvt_f32_f16_e32 v13, s91 ; SI-NEXT: v_cvt_f32_f16_e32 v12, s88 ; SI-NEXT: v_cvt_f32_f16_e32 v15, s79 @@ -178959,32 +178984,35 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v40, s25 ; SI-NEXT: v_cvt_f32_f16_e32 v43, s22 ; SI-NEXT: v_cvt_f32_f16_e32 v42, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s12 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s9 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: .LBB93_3: ; %end ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_readlane_b32 s99, v63, 35 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 +; SI-NEXT: v_readlane_b32 s99, v63, 35 ; SI-NEXT: v_readlane_b32 s98, v63, 34 ; SI-NEXT: v_readlane_b32 s97, v63, 33 ; SI-NEXT: v_readlane_b32 s96, v63, 32 @@ -179030,7 +179058,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -179040,7 +179068,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: v_add_i32_e32 v6, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 ; SI-NEXT: v_add_i32_e32 v7, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 @@ -179055,7 +179083,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v5, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v10 ; SI-NEXT: v_add_i32_e32 v7, vcc, 20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 @@ -179194,48 +179222,45 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v44 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v42 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v45 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x68, v0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v47 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v57 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x70, v0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v58 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x74, v0 -; SI-NEXT: v_or_b32_e32 v1, v5, v1 -; SI-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v5, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v59 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 ; SI-NEXT: v_add_i32_e32 v5, vcc, 0x78, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -179266,74 +179291,73 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_mov_b32 s17, s19 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: s_mov_b32 s19, s50 +; SI-NEXT: s_mov_b32 s19, s54 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: s_mov_b32 s23, s48 -; SI-NEXT: s_mov_b32 s26, s37 -; SI-NEXT: s_mov_b32 s28, s29 -; SI-NEXT: s_mov_b32 s29, s76 -; SI-NEXT: s_mov_b32 s59, s58 +; SI-NEXT: s_mov_b32 s26, s50 +; SI-NEXT: s_mov_b32 s23, s35 +; SI-NEXT: s_mov_b32 s15, s75 +; SI-NEXT: s_mov_b32 s20, s76 +; SI-NEXT: s_mov_b32 s98, s59 +; SI-NEXT: s_mov_b32 s58, s57 ; SI-NEXT: s_mov_b32 s56, s47 -; SI-NEXT: s_mov_b32 s46, s41 +; SI-NEXT: s_mov_b32 s41, s14 ; SI-NEXT: s_mov_b32 s12, s11 -; SI-NEXT: s_mov_b32 s11, s7 -; SI-NEXT: s_mov_b32 s7, s97 -; SI-NEXT: s_mov_b32 s97, s81 -; SI-NEXT: s_mov_b32 s81, s85 -; SI-NEXT: s_mov_b32 s6, s40 -; SI-NEXT: s_mov_b32 s40, s72 -; SI-NEXT: s_mov_b32 s45, s73 -; SI-NEXT: s_mov_b32 s15, s89 -; SI-NEXT: s_mov_b32 s24, s98 -; SI-NEXT: s_mov_b32 s20, s88 -; SI-NEXT: s_mov_b32 s99, s55 +; SI-NEXT: s_mov_b32 s7, s85 +; SI-NEXT: s_mov_b32 s28, s29 +; SI-NEXT: s_mov_b32 s29, s25 +; SI-NEXT: s_mov_b32 s85, s97 +; SI-NEXT: s_mov_b32 s97, s21 +; SI-NEXT: s_mov_b32 s25, s69 +; SI-NEXT: s_mov_b32 s69, s81 +; SI-NEXT: s_mov_b32 s37, s66 +; SI-NEXT: s_mov_b32 s66, s53 +; SI-NEXT: s_mov_b32 s53, s34 +; SI-NEXT: s_mov_b32 s34, s89 +; SI-NEXT: s_mov_b32 s89, s93 +; SI-NEXT: s_mov_b32 s44, s45 +; SI-NEXT: s_mov_b32 s13, s39 +; SI-NEXT: s_mov_b32 s88, s30 +; SI-NEXT: s_mov_b32 s38, s95 +; SI-NEXT: s_mov_b32 s94, s68 +; SI-NEXT: s_mov_b32 s36, s96 +; SI-NEXT: s_mov_b32 s31, s6 +; SI-NEXT: v_readlane_b32 s6, v61, 34 +; SI-NEXT: v_readlane_b32 s99, v61, 35 +; SI-NEXT: v_readlane_b32 s55, v61, 33 +; SI-NEXT: v_readlane_b32 s86, v61, 32 +; SI-NEXT: v_readlane_b32 s96, v61, 30 +; SI-NEXT: v_readlane_b32 s51, v61, 31 +; SI-NEXT: v_readlane_b32 s83, v61, 28 +; SI-NEXT: v_readlane_b32 s82, v61, 29 +; SI-NEXT: v_readlane_b32 s84, v61, 26 +; SI-NEXT: v_readlane_b32 s87, v61, 27 +; SI-NEXT: v_readlane_b32 s80, v61, 25 +; SI-NEXT: v_readlane_b32 s71, v61, 24 +; SI-NEXT: v_readlane_b32 s49, v61, 22 +; SI-NEXT: v_readlane_b32 s70, v61, 23 +; SI-NEXT: v_readlane_b32 s65, v61, 20 +; SI-NEXT: v_readlane_b32 s67, v61, 21 +; SI-NEXT: v_readlane_b32 s54, v61, 18 +; SI-NEXT: v_readlane_b32 s64, v61, 19 +; SI-NEXT: v_readlane_b32 s50, v61, 17 +; SI-NEXT: v_readlane_b32 s35, v61, 16 +; SI-NEXT: v_readlane_b32 s52, v61, 14 +; SI-NEXT: v_readlane_b32 s48, v61, 15 +; SI-NEXT: v_readlane_b32 s30, v61, 12 +; SI-NEXT: v_readlane_b32 s39, v61, 10 +; SI-NEXT: v_readlane_b32 s92, v61, 11 +; SI-NEXT: v_readlane_b32 s77, v61, 9 +; SI-NEXT: v_readlane_b32 s75, v61, 13 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: v_readlane_b32 s75, v61, 7 -; SI-NEXT: v_readlane_b32 s76, v61, 3 -; SI-NEXT: v_readlane_b32 s77, v61, 5 -; SI-NEXT: v_readlane_b32 s78, v61, 4 -; SI-NEXT: v_readlane_b32 s92, v61, 6 -; SI-NEXT: v_readlane_b32 s39, v61, 9 -; SI-NEXT: v_readlane_b32 s37, v61, 8 -; SI-NEXT: v_readlane_b32 s30, v61, 10 -; SI-NEXT: v_readlane_b32 s48, v61, 11 -; SI-NEXT: v_readlane_b32 s52, v61, 13 -; SI-NEXT: v_readlane_b32 s35, v61, 12 -; SI-NEXT: v_readlane_b32 s50, v61, 15 -; SI-NEXT: v_readlane_b32 s64, v61, 14 -; SI-NEXT: v_readlane_b32 s54, v61, 17 -; SI-NEXT: v_readlane_b32 s67, v61, 16 -; SI-NEXT: v_readlane_b32 s65, v61, 18 -; SI-NEXT: v_readlane_b32 s70, v61, 19 -; SI-NEXT: v_readlane_b32 s49, v61, 21 -; SI-NEXT: v_readlane_b32 s71, v61, 20 -; SI-NEXT: v_readlane_b32 s80, v61, 23 -; SI-NEXT: v_readlane_b32 s83, v61, 22 -; SI-NEXT: v_readlane_b32 s84, v61, 25 -; SI-NEXT: v_readlane_b32 s82, v61, 24 -; SI-NEXT: v_readlane_b32 s87, v61, 26 -; SI-NEXT: v_readlane_b32 s86, v61, 27 -; SI-NEXT: v_readlane_b32 s96, v61, 29 -; SI-NEXT: v_readlane_b32 s51, v61, 28 -; SI-NEXT: s_mov_b32 s55, s93 -; SI-NEXT: s_mov_b32 s95, s91 -; SI-NEXT: v_readlane_b32 s94, v61, 31 -; SI-NEXT: s_mov_b32 s31, s90 -; SI-NEXT: v_readlane_b32 s34, v61, 30 -; SI-NEXT: v_readlane_b32 s53, v61, 32 -; SI-NEXT: v_readlane_b32 s66, v61, 33 -; SI-NEXT: v_readlane_b32 s68, v61, 34 -; SI-NEXT: v_readlane_b32 s69, v61, 35 -; SI-NEXT: v_readlane_b32 s8, v61, 36 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr15 @@ -179374,16 +179398,16 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 @@ -235330,7 +235354,6 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: v_writelane_b32 v40, s85, 29 ; SI-NEXT: v_writelane_b32 v40, s86, 30 ; SI-NEXT: v_writelane_b32 v40, s87, 31 -; SI-NEXT: s_mov_b32 s74, s23 ; SI-NEXT: s_mov_b32 s72, s21 ; SI-NEXT: s_mov_b32 s61, s18 ; SI-NEXT: ; implicit-def: $vgpr41 : SGPR spill to VGPR lane @@ -235341,6 +235364,7 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: v_writelane_b32 v41, s19, 2 ; SI-NEXT: v_writelane_b32 v41, s61, 3 ; SI-NEXT: v_writelane_b32 v41, s72, 4 +; SI-NEXT: s_mov_b32 s74, s23 ; SI-NEXT: v_writelane_b32 v41, s20, 5 ; SI-NEXT: v_writelane_b32 v41, s74, 6 ; SI-NEXT: s_mov_b32 s76, s25 @@ -235352,10 +235376,10 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_mov_b32 s88, s29 ; SI-NEXT: v_writelane_b32 v41, s26, 11 ; SI-NEXT: v_writelane_b32 v41, s88, 12 -; SI-NEXT: v_readfirstlane_b32 s77, v2 +; SI-NEXT: v_readfirstlane_b32 s23, v2 ; SI-NEXT: v_writelane_b32 v41, s28, 13 ; SI-NEXT: v_readfirstlane_b32 s79, v4 -; SI-NEXT: v_writelane_b32 v41, s77, 14 +; SI-NEXT: v_writelane_b32 v41, s23, 14 ; SI-NEXT: v_readfirstlane_b32 s90, v3 ; SI-NEXT: v_writelane_b32 v41, s79, 15 ; SI-NEXT: v_readfirstlane_b32 s91, v6 @@ -235389,7 +235413,7 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s84, v34 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s23, v35 +; SI-NEXT: v_readfirstlane_b32 s77, v35 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s83, v36 ; SI-NEXT: s_waitcnt vmcnt(8) @@ -235482,7 +235506,7 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_lshl_b32 s96, s28, 16 ; SI-NEXT: s_lshl_b32 s62, s88, 16 ; SI-NEXT: s_lshl_b32 s97, s5, 16 -; SI-NEXT: s_lshl_b32 s99, s77, 16 +; SI-NEXT: s_lshl_b32 s99, s23, 16 ; SI-NEXT: s_lshl_b32 s85, s90, 16 ; SI-NEXT: s_lshl_b32 s86, s79, 16 ; SI-NEXT: s_lshl_b32 s81, s92, 16 @@ -235524,25 +235548,29 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_lshl_b32 s76, s59, 16 ; SI-NEXT: s_lshl_b32 s24, s58, 16 ; SI-NEXT: s_lshl_b32 s74, s87, 16 -; SI-NEXT: s_mov_b32 s77, s18 +; SI-NEXT: s_mov_b32 s23, s18 ; SI-NEXT: s_lshl_b32 s22, s18, 16 ; SI-NEXT: s_lshl_b32 s72, s83, 16 -; SI-NEXT: s_mov_b32 s79, s23 -; SI-NEXT: s_lshl_b32 s20, s23, 16 +; SI-NEXT: s_mov_b32 s79, s77 +; SI-NEXT: s_lshl_b32 s20, s77, 16 ; SI-NEXT: s_lshl_b32 s61, s84, 16 ; SI-NEXT: s_mov_b32 s18, s75 ; SI-NEXT: s_lshl_b32 s19, s75, 16 ; SI-NEXT: s_lshl_b32 s60, s80, 16 +; SI-NEXT: s_mov_b32 s77, s21 ; SI-NEXT: s_lshl_b32 s17, s21, 16 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB107_3 ; SI-NEXT: .LBB107_2: ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: s_mov_b32 s79, s23 +; SI-NEXT: s_mov_b32 s79, s77 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: s_mov_b32 s23, s18 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: s_mov_b32 s77, s18 +; SI-NEXT: s_mov_b32 s77, s21 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; kill: killed $sgpr6 ; SI-NEXT: s_mov_b32 s18, s75 @@ -235610,8 +235638,6 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; kill: killed $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: .LBB107_3: ; %Flow ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: s_mov_b32 s4, s60 @@ -235647,31 +235673,43 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_cbranch_vccnz .LBB107_5 ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_add_i32 s80, s80, 3 ; SI-NEXT: s_and_b32 s13, s13, 0xffff ; SI-NEXT: s_lshl_b32 s12, s12, 16 ; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: v_readlane_b32 s6, v41, 30 +; SI-NEXT: s_and_b32 s4, s80, 0xffff +; SI-NEXT: s_lshl_b32 s5, s77, 16 +; SI-NEXT: s_add_i32 s84, s84, 3 ; SI-NEXT: s_add_i32 s15, s15, 3 ; SI-NEXT: s_or_b32 s12, s12, s13 ; SI-NEXT: s_and_b32 s11, s11, 0xffff ; SI-NEXT: s_lshl_b32 s10, s10, 16 ; SI-NEXT: s_lshl_b32 s13, s6, 16 ; SI-NEXT: v_readlane_b32 s6, v41, 29 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s84, 0xffff +; SI-NEXT: s_lshl_b32 s60, s18, 16 +; SI-NEXT: s_add_i32 s83, s83, 3 ; SI-NEXT: s_and_b32 s15, s15, 0xffff ; SI-NEXT: s_lshl_b32 s14, s14, 16 ; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: s_lshl_b32 s11, s25, 16 ; SI-NEXT: s_add_i32 s25, s6, 3 ; SI-NEXT: v_readlane_b32 s6, v41, 28 +; SI-NEXT: s_or_b32 s5, s60, s5 +; SI-NEXT: s_and_b32 s60, s83, 0xffff +; SI-NEXT: s_lshl_b32 s61, s79, 16 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: s_lshl_b32 s15, s6, 16 ; SI-NEXT: v_readlane_b32 s6, v41, 27 +; SI-NEXT: s_or_b32 vcc_lo, s61, s60 +; SI-NEXT: s_lshl_b32 s61, s23, 16 ; SI-NEXT: s_add_i32 s23, s6, 3 ; SI-NEXT: v_readlane_b32 s6, v41, 26 ; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_lshl_b32 s20, s6, 16 ; SI-NEXT: v_readlane_b32 s6, v41, 25 -; SI-NEXT: s_lshl_b32 s5, s21, 16 ; SI-NEXT: s_and_b32 s9, s9, 0xffff ; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_add_i32 s7, s7, 3 @@ -235702,7 +235740,6 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_and_b32 s15, s23, 0xffff ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_lshl_b32 s60, s18, 16 ; SI-NEXT: s_or_b32 s15, s20, s15 ; SI-NEXT: s_and_b32 s20, s21, 0xffff ; SI-NEXT: s_or_b32 s6, s17, s6 @@ -235767,20 +235804,11 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_and_b32 s27, s27, 0xffff ; SI-NEXT: s_lshl_b32 s28, s28, 16 ; SI-NEXT: s_or_b32 s27, s28, s27 -; SI-NEXT: s_add_i32 s80, s80, 3 ; SI-NEXT: s_add_i32 s27, s27, 0x30000 -; SI-NEXT: s_and_b32 s4, s80, 0xffff -; SI-NEXT: s_add_i32 s84, s84, 3 ; SI-NEXT: s_and_b32 s28, s27, 0xffff0000 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s84, 0xffff -; SI-NEXT: s_add_i32 s83, s83, 3 ; SI-NEXT: s_add_i32 s26, s26, 0x30000 ; SI-NEXT: v_writelane_b32 v41, s28, 31 ; SI-NEXT: s_lshl_b32 s27, s27, 16 -; SI-NEXT: s_or_b32 s5, s60, s5 -; SI-NEXT: s_and_b32 s60, s83, 0xffff -; SI-NEXT: s_lshl_b32 s61, s79, 16 ; SI-NEXT: s_add_i32 s87, s87, 3 ; SI-NEXT: s_add_i32 s59, s59, 3 ; SI-NEXT: s_add_i32 s57, s57, 3 @@ -235790,9 +235818,7 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_add_i32 s41, s41, 3 ; SI-NEXT: v_writelane_b32 v41, s27, 32 ; SI-NEXT: s_and_b32 s27, s26, 0xffff0000 -; SI-NEXT: s_or_b32 vcc_lo, s61, s60 ; SI-NEXT: s_and_b32 s60, s87, 0xffff -; SI-NEXT: s_lshl_b32 s61, s77, 16 ; SI-NEXT: s_and_b32 s59, s59, 0xffff ; SI-NEXT: s_lshl_b32 s58, s58, 16 ; SI-NEXT: s_and_b32 s57, s57, 0xffff diff --git a/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir b/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir index 4a0bb6ceccd3f..e7eefafe31203 100644 --- a/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir +++ b/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir @@ -41,24 +41,26 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $sgpr18_sgpr19 = V_CMP_GT_I32_e64 1, undef %18:vgpr_32, implicit $exec + ; CHECK-NEXT: SI_SPILL_S32_SAVE $sgpr15, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: SI_SPILL_S32_SAVE $sgpr14, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.1, addrspace 5) + ; CHECK-NEXT: renamable $sgpr14_sgpr15 = V_CMP_GT_I32_e64 1, undef %18:vgpr_32, implicit $exec ; CHECK-NEXT: renamable $sgpr12_sgpr13 = V_CMP_EQ_U32_e64 0, undef %18:vgpr_32, implicit $exec - ; CHECK-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr12_sgpr13, %stack.2, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.2, align 4, addrspace 5) - ; CHECK-NEXT: renamable $sgpr12_sgpr13 = V_CMP_NE_U32_e64 0, undef %18:vgpr_32, implicit $exec ; CHECK-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr12_sgpr13, %stack.3, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.3, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr12_sgpr13 = V_CMP_NE_U32_e64 0, undef %18:vgpr_32, implicit $exec + ; CHECK-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr12_sgpr13, %stack.4, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.4, align 4, addrspace 5) ; CHECK-NEXT: renamable $sgpr34_sgpr35 = V_CMP_GT_I32_e64 0, undef %18:vgpr_32, implicit $exec ; CHECK-NEXT: renamable $sgpr56 = S_MOV_B32 0 ; CHECK-NEXT: renamable $sgpr12_sgpr13 = V_CMP_EQ_U32_e64 undef $sgpr4, undef %18:vgpr_32, implicit $exec - ; CHECK-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr12_sgpr13, %stack.4, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.4, align 4, addrspace 5) + ; CHECK-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr12_sgpr13, %stack.5, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.5, align 4, addrspace 5) ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_1024_align2 = COPY renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, implicit $exec ; CHECK-NEXT: renamable $sgpr100_sgpr101 = V_CMP_NE_U32_e64 1, undef %18:vgpr_32, implicit $exec ; CHECK-NEXT: renamable $sgpr57 = S_MOV_B32 1083786240 - ; CHECK-NEXT: SI_SPILL_S1024_SAVE renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s1024) into %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: SI_SPILL_S1024_SAVE renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, %stack.2, implicit $exec, implicit $sgpr32 :: (store (s1024) into %stack.2, align 4, addrspace 5) ; CHECK-NEXT: S_BRANCH %bb.1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.17(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr14_sgpr15, $sgpr34_sgpr35, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $vcc = S_AND_B64 $exec, renamable $sgpr100_sgpr101, implicit-def dead $scc ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_1024_align2 = COPY [[COPY]] @@ -67,9 +69,9 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.11(0x40000000), %bb.5(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr14_sgpr15, $sgpr34_sgpr35, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87 = SI_SPILL_S1024_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.2, align 4, addrspace 5) ; CHECK-NEXT: renamable $sgpr40 = COPY renamable $sgpr72 ; CHECK-NEXT: renamable $sgpr41 = COPY renamable $sgpr72 ; CHECK-NEXT: renamable $sgpr42 = COPY renamable $sgpr72 @@ -83,58 +85,58 @@ body: | ; CHECK-NEXT: renamable $sgpr50 = COPY renamable $sgpr72 ; CHECK-NEXT: renamable $sgpr51 = COPY killed renamable $sgpr72 ; CHECK-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 = COPY killed renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 - ; CHECK-NEXT: renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87 = SI_SPILL_S1024_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.2, align 4, addrspace 5) ; CHECK-NEXT: renamable $sgpr56 = COPY killed renamable $sgpr72 - ; CHECK-NEXT: renamable $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91 = SI_SPILL_S1024_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.2, align 4, addrspace 5) ; CHECK-NEXT: renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = COPY killed renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 ; CHECK-NEXT: renamable $sgpr52 = COPY renamable $sgpr56 ; CHECK-NEXT: renamable $sgpr53 = COPY killed renamable $sgpr76 ; CHECK-NEXT: renamable $sgpr56_sgpr57 = COPY renamable $sgpr52_sgpr53 ; CHECK-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 = COPY killed renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 - ; CHECK-NEXT: renamable $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91 = SI_SPILL_S1024_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.2, align 4, addrspace 5) ; CHECK-NEXT: renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = COPY killed renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 ; CHECK-NEXT: renamable $sgpr52_sgpr53 = COPY renamable $sgpr56_sgpr57 ; CHECK-NEXT: renamable $sgpr54 = COPY killed renamable $sgpr76 ; CHECK-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47 = COPY killed renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 ; CHECK-NEXT: renamable $sgpr48_sgpr49_sgpr50 = COPY renamable $sgpr52_sgpr53_sgpr54 - ; CHECK-NEXT: renamable $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83 = SI_SPILL_S1024_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.2, align 4, addrspace 5) ; CHECK-NEXT: renamable $sgpr52_sgpr53_sgpr54 = COPY renamable $sgpr48_sgpr49_sgpr50 ; CHECK-NEXT: renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = COPY killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47 ; CHECK-NEXT: renamable $sgpr55 = COPY killed renamable $sgpr68 - ; CHECK-NEXT: renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87 = SI_SPILL_S1024_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.2, align 4, addrspace 5) ; CHECK-NEXT: renamable $sgpr56 = COPY killed renamable $sgpr72 - ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.2, align 4, addrspace 5) ; CHECK-NEXT: renamable $sgpr57 = COPY killed renamable $sgpr84 - ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.2, align 4, addrspace 5) ; CHECK-NEXT: renamable $sgpr58 = COPY killed renamable $sgpr84 - ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.2, align 4, addrspace 5) ; CHECK-NEXT: renamable $sgpr59 = COPY killed renamable $sgpr84 - ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.2, align 4, addrspace 5) ; CHECK-NEXT: renamable $sgpr60 = COPY killed renamable $sgpr84 - ; CHECK-NEXT: renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.2, align 4, addrspace 5) ; CHECK-NEXT: renamable $sgpr61 = COPY killed renamable $sgpr80 - ; CHECK-NEXT: renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.2, align 4, addrspace 5) ; CHECK-NEXT: renamable $sgpr62 = COPY killed renamable $sgpr80 - ; CHECK-NEXT: renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.2, align 4, addrspace 5) ; CHECK-NEXT: renamable $sgpr63 = COPY killed renamable $sgpr80 - ; CHECK-NEXT: renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.2, align 4, addrspace 5) ; CHECK-NEXT: renamable $sgpr64 = COPY killed renamable $sgpr80 - ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.2, align 4, addrspace 5) ; CHECK-NEXT: renamable $sgpr65 = COPY killed renamable $sgpr84 - ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.2, align 4, addrspace 5) ; CHECK-NEXT: renamable $sgpr66 = COPY killed renamable $sgpr84 - ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.2, align 4, addrspace 5) ; CHECK-NEXT: renamable $sgpr67 = COPY killed renamable $sgpr84 - ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.2, align 4, addrspace 5) ; CHECK-NEXT: renamable $sgpr68 = COPY killed renamable $sgpr84 ; CHECK-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = COPY killed renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 ; CHECK-NEXT: renamable $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 = COPY renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 ; CHECK-NEXT: renamable $sgpr64 = COPY renamable $sgpr68 - ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.2, align 4, addrspace 5) ; CHECK-NEXT: renamable $sgpr65 = COPY killed renamable $sgpr84 - ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.2, align 4, addrspace 5) ; CHECK-NEXT: renamable $sgpr66 = COPY killed renamable $sgpr84 - ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.2, align 4, addrspace 5) ; CHECK-NEXT: renamable $sgpr67 = COPY killed renamable $sgpr84 ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_1024_align2 = COPY killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67, implicit $exec ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.11, implicit $exec @@ -142,32 +144,30 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x80000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16 + ; CHECK-NEXT: liveins: $sgpr16 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $sgpr60 = COPY killed renamable $sgpr14 - ; CHECK-NEXT: renamable $sgpr62 = COPY killed renamable $sgpr15 - ; CHECK-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr16, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: renamable $sgpr60 = COPY killed renamable $sgpr16 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr4_sgpr5, 0, CustomRegMask($sgpr60,$sgpr62) ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.17(0x80000000) - ; CHECK-NEXT: liveins: $sgpr60, $sgpr62 + ; CHECK-NEXT: liveins: $sgpr60 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 - ; CHECK-NEXT: $sgpr12 = COPY killed renamable $sgpr60 - ; CHECK-NEXT: $sgpr13 = COPY killed renamable $sgpr62 - ; CHECK-NEXT: $sgpr14 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: $sgpr12 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.1, addrspace 5) + ; CHECK-NEXT: $sgpr13 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: $sgpr14 = COPY killed renamable $sgpr60 ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr4_sgpr5, 0, csr_amdgpu_noregs, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; CHECK-NEXT: S_BRANCH %bb.17 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: successors: %bb.12(0x40000000), %bb.6(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr14_sgpr15, $sgpr34_sgpr35, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $sgpr12_sgpr13 = SI_SPILL_S64_RESTORE %stack.3, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.3, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr12_sgpr13 = SI_SPILL_S64_RESTORE %stack.4, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.4, align 4, addrspace 5) ; CHECK-NEXT: renamable $sgpr12_sgpr13 = S_AND_B64 killed renamable $sgpr12_sgpr13, undef renamable $sgpr54_sgpr55, implicit-def dead $scc ; CHECK-NEXT: renamable $sgpr54_sgpr55 = V_CMP_GT_I32_e64 0, undef %18:vgpr_32, implicit $exec ; CHECK-NEXT: $exec = S_MOV_B64_term killed renamable $sgpr12_sgpr13 @@ -175,33 +175,33 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.6: ; CHECK-NEXT: successors: %bb.7(0x80000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr14_sgpr15, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: dead [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr34_sgpr35, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.7: ; CHECK-NEXT: successors: %bb.8(0x80000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr14_sgpr15, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $sgpr64_sgpr65 = nofpexcept V_CMP_NLT_F64_e64 0, undef $sgpr4_sgpr5, 0, undef %29:vreg_64_align2, 0, implicit $mode, implicit $exec ; CHECK-NEXT: renamable $sgpr66_sgpr67 = nofpexcept V_CMP_NLT_F64_e64 0, 4607182418800017408, 0, undef %29:vreg_64_align2, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: dead [[V_INDIRECT_REG_READ_GPR_IDX_B32_V32_:%[0-9]+]]:vgpr_32 = V_INDIRECT_REG_READ_GPR_IDX_B32_V32 [[COPY1]], undef $sgpr14, 11, implicit-def $m0, implicit $m0, implicit $exec + ; CHECK-NEXT: dead [[V_INDIRECT_REG_READ_GPR_IDX_B32_V32_:%[0-9]+]]:vgpr_32 = V_INDIRECT_REG_READ_GPR_IDX_B32_V32 [[COPY1]], undef $sgpr4, 11, implicit-def $m0, implicit $m0, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.8: ; CHECK-NEXT: successors: %bb.10(0x40000000), %bb.9(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr14_sgpr15, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $vcc = S_AND_B64 $exec, renamable $sgpr64_sgpr65, implicit-def dead $scc ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.10, implicit $vcc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.9: ; CHECK-NEXT: successors: %bb.10(0x40000000), %bb.17(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr14_sgpr15, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.2, align 4, addrspace 5) ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY killed renamable $sgpr84_sgpr85, implicit $exec ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR undef %18:vgpr_32, [[COPY2]], undef renamable $sgpr4_sgpr5, 0, 0, implicit $exec :: (store (s64), addrspace 1) - ; CHECK-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr18_sgpr19, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr14_sgpr15, implicit $exec ; CHECK-NEXT: dead renamable $sgpr12_sgpr13 = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_1]], implicit $exec ; CHECK-NEXT: renamable $sgpr82 = S_ADD_U32 renamable $sgpr8, 32, implicit-def dead $scc ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 @@ -211,21 +211,17 @@ body: | ; CHECK-NEXT: $sgpr6_sgpr7 = COPY renamable $sgpr70_sgpr71 ; CHECK-NEXT: renamable $sgpr80_sgpr81 = COPY killed renamable $sgpr10_sgpr11 ; CHECK-NEXT: $sgpr10_sgpr11 = COPY renamable $sgpr80_sgpr81 - ; CHECK-NEXT: $sgpr12 = COPY renamable $sgpr14 - ; CHECK-NEXT: $sgpr13 = COPY renamable $sgpr15 + ; CHECK-NEXT: $sgpr12 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.1, addrspace 5) + ; CHECK-NEXT: $sgpr13 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.0, addrspace 5) ; CHECK-NEXT: renamable $sgpr84 = COPY killed renamable $sgpr8 ; CHECK-NEXT: renamable $sgpr33 = COPY killed renamable $sgpr16 - ; CHECK-NEXT: renamable $sgpr83 = COPY killed renamable $sgpr15 - ; CHECK-NEXT: renamable $sgpr85 = COPY killed renamable $sgpr14 - ; CHECK-NEXT: renamable $sgpr48_sgpr49 = COPY killed renamable $sgpr18_sgpr19 + ; CHECK-NEXT: renamable $sgpr48_sgpr49 = COPY killed renamable $sgpr14_sgpr15 ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr12_sgpr13, 0, csr_amdgpu_gfx90ainsts, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; CHECK-NEXT: $sgpr8_sgpr9 = COPY renamable $sgpr82_sgpr83 ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr12_sgpr13, 0, csr_amdgpu_gfx90ainsts, implicit $sgpr8_sgpr9 - ; CHECK-NEXT: renamable $sgpr18_sgpr19 = COPY killed renamable $sgpr48_sgpr49 - ; CHECK-NEXT: renamable $sgpr14 = COPY killed renamable $sgpr85 - ; CHECK-NEXT: renamable $sgpr15 = COPY killed renamable $sgpr83 + ; CHECK-NEXT: renamable $sgpr14_sgpr15 = COPY killed renamable $sgpr48_sgpr49 ; CHECK-NEXT: renamable $sgpr16 = COPY killed renamable $sgpr33 ; CHECK-NEXT: renamable $sgpr4_sgpr5 = COPY killed renamable $sgpr68_sgpr69 ; CHECK-NEXT: renamable $sgpr6_sgpr7 = COPY killed renamable $sgpr70_sgpr71 @@ -238,49 +234,49 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.10: ; CHECK-NEXT: successors: %bb.8(0x40000000), %bb.12(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr14_sgpr15, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.8, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.12 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.11: ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.17(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr14_sgpr15, $sgpr34_sgpr35, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.17 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.12: ; CHECK-NEXT: successors: %bb.11(0x40000000), %bb.13(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr14_sgpr15, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_MOV_B64_term killed renamable $sgpr54_sgpr55 ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.11, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.13: ; CHECK-NEXT: successors: %bb.15(0x40000000), %bb.14(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr14_sgpr15, $sgpr34_sgpr35, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $sgpr12_sgpr13 = SI_SPILL_S64_RESTORE %stack.4, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.4, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr12_sgpr13 = SI_SPILL_S64_RESTORE %stack.5, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.5, align 4, addrspace 5) ; CHECK-NEXT: $vcc = S_AND_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.15, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.14 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.14: ; CHECK-NEXT: successors: %bb.15(0x80000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr14_sgpr15, $sgpr34_sgpr35, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.15: ; CHECK-NEXT: successors: %bb.11(0x40000000), %bb.16(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr14_sgpr15, $sgpr34_sgpr35, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $sgpr12_sgpr13 = SI_SPILL_S64_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.2, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr12_sgpr13 = SI_SPILL_S64_RESTORE %stack.3, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.3, align 4, addrspace 5) ; CHECK-NEXT: $vcc = S_AND_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.11, implicit $vcc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.16: ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.17(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16 + ; CHECK-NEXT: liveins: $sgpr16 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec ; CHECK-NEXT: {{ $}}